Add support for vorbis files to --extract-audio
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.18c'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
72 std_headers = {
73         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76         'Accept-Encoding': 'gzip, deflate',
77         'Accept-Language': 'en-us,en;q=0.5',
78 }
79
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal stand-in for the stdlib json module; only loads() is provided."""
		@staticmethod
		def loads(s):
			"""Parse the JSON document in byte string s and return the Python value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors include the offset and the remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past whitespace; with expectMore, fail on end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Decode one backslash escape (simple, \uXXXX, or a UTF-16
				# surrogate pair matched as \uD8xx\uDCxx) into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Combine high/low surrogate halves into one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# i points at the opening quote; returns (index past closing quote, text).
				i += 1
				e = i
				while True:
					# Find the closing quote, skipping quotes preceded by an
					# odd number of backslashes (i.e. escaped quotes).
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# i points at '{'; returns (next index, dict).
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# i points at '['; returns (next index, list).
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three JSON keyword literals.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent part makes it a float, otherwise an int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first non-space character; numbers are the fallback.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	reported codec is missing, bogus or unusable, fall back to UTF-8.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and can encode text.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare except: we still want best-effort fallback,
		# but must not swallow KeyboardInterrupt/SystemExit.
		pref = 'UTF-8'
	# The original wrapped this in a one-shot generator and called .next()
	# on it, which added nothing; a plain return is equivalent.
	return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means stdout; on Windows force binary mode so CRLF
			# translation does not corrupt downloaded data.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
270
271
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
289
290
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
298
299
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
307
308
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
316
317
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both counts are in bytes; class-level defaults kept for introspection.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		"""Decompress a deflate-encoded body, tolerating both raw and zlib framing."""
		try:
			# Some servers send raw deflate data without the zlib header.
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		"""Build an addinfourl carrying the response code, portably.

		addinfourl only accepts a 'code' constructor argument on newer
		Pythons (detected via the getcode attribute); emulate it otherwise.
		"""
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		"""Force std_headers onto the outgoing request, replacing caller values."""
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# The marker header opts this request out of compression and must
		# not be sent to the server itself.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		"""Transparently decompress gzip/deflate response bodies."""
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
	# Option dictionary (see class docstring); assigned in __init__.
	params = None
	# Registered InfoExtractors, consulted in order.
	_ies = []
	# Registered PostProcessors, run in order after a download.
	_pps = []
	# Process return code: 0 until a download error occurs.
	_download_retcode = None
	# Ordinal counter backing the %(autonumber)s output-template key.
	_num_downloads = None
	# Stream for screen output: stdout, or stderr when logtostderr is set.
	_screen_file = None
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# Mutual registration: the IE gets a back-reference to this downloader.
		self._ies.append(ie)
		ie.set_downloader(self)
535
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# Mutual registration: the PP gets a back-reference to this downloader.
		self._pps.append(pp)
		pp.set_downloader(self)
540
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; the
				# terminator alone decides whether the line is ended.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
551
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly: stderr may reject raw unicode under some locales.
		print >>sys.stderr, message.encode(preferredencoding())
555
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible title escape: OSC 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# Fixed means no %(field)s placeholders at all, so every download
		# would be written to the very same file name.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting trouble on failure."""
		try:
			# Renaming onto itself is a no-op (e.g. when .part files are disabled).
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		# Filenames may contain characters the console cannot encode; ignore those.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		# Filenames may contain characters the console cannot encode; ignore those.
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
	def report_destination(self, filename):
		"""Report destination filename."""
		# Filenames may contain characters the console cannot encode; ignore those.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# Leading \r rewrites the current line in place to animate the bar.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_screen(u'[download] The file has already been downloaded')
670
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
	def prepare_filename(self, info_dict):
		"""Generate the output filename from the 'outtmpl' template.

		Returns the filename, or None (after reporting trouble) when the
		template cannot be expanded against the given info dictionary.
		"""
		try:
			template_dict = dict(info_dict)
			# Extra template fields: %(epoch)s is the current UNIX timestamp,
			# %(autonumber)s a zero-padded per-run download ordinal.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			# Missing template key or malformed format string.
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles forced printing, simulate mode, title match/reject
		filtering, overwrite protection, writing of the description and
		JSON sidecar files, the download itself, and postprocessing —
		each step gated by the corresponding self.params flag.
		"""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() reported trouble already; nothing left to do.
		if filename is None:
			return

		# Title-based filtering: skip videos whose title fails the
		# --match-title / --reject-title regexes (case-insensitive).
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Make sure the target directory exists before opening any file in it.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable JSON encoder exists: touching json.dump
			# raises NameError/AttributeError when no json module was set up
			# (presumably handled near the top of the file — verify).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			# Local filesystem errors are surfaced as UnavailableVideoError
			# for the caller; the original error details are discarded.
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
794
795         def download(self, url_list):
796                 """Download a given list of URLs."""
797                 if len(url_list) > 1 and self.fixed_template():
798                         raise SameFileError(self.params['outtmpl'])
799
800                 for url in url_list:
801                         suitable_found = False
802                         for ie in self._ies:
803                                 # Go to next InfoExtractor if not suitable
804                                 if not ie.suitable(url):
805                                         continue
806
807                                 # Suitable InfoExtractor found
808                                 suitable_found = True
809
810                                 # Extract information from URL and process it
811                                 ie.extract(url)
812
813                                 # Suitable InfoExtractor had been found; go to next URL
814                                 break
815
816                         if not suitable_found:
817                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
818
819                 return self._download_retcode
820
821         def post_process(self, filename, ie_info):
822                 """Run the postprocessing chain on the given file."""
823                 info = dict(ie_info)
824                 info['filepath'] = filename
825                 for pp in self._pps:
826                         info = pp.run(info)
827                         if info is None:
828                                 break
829
830         def _download_with_rtmpdump(self, filename, url, player_url):
831                 self.report_destination(filename)
832                 tmpfilename = self.temp_name(filename)
833
834                 # Check for rtmpdump first
835                 try:
836                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
837                 except (OSError, IOError):
838                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
839                         return False
840
841                 # Download using rtmpdump. rtmpdump returns exit code 2 when
842                 # the connection was interrumpted and resuming appears to be
843                 # possible. This is part of rtmpdump's normal usage, AFAIK.
844                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
845                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
846                 while retval == 2 or retval == 1:
847                         prevsize = os.path.getsize(tmpfilename)
848                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
849                         time.sleep(5.0) # This seems to be needed
850                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
851                         cursize = os.path.getsize(tmpfilename)
852                         if prevsize == cursize and retval == 1:
853                                 break
854                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
855                         if prevsize == cursize and retval == 2 and cursize > 1024:
856                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
857                                 retval = 0
858                                 break
859                 if retval == 0:
860                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
861                         self.try_rename(tmpfilename, filename)
862                         return True
863                 else:
864                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
865                         return False
866
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to *filename* over HTTP (or rtmpdump).

		Returns True on success (including the "already fully downloaded"
		case), False on a reported failure. Re-raises unexpected HTTP
		errors and raises ContentTooShortError when the server delivered
		fewer bytes than announced.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept around as a fallback without the Range
		# header, used below when a resume attempt fails with 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time (avoids creating an empty file when the
			# server never sends any data).
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1010
1011
class InfoExtractor(object):
	"""Base class for all site-specific information extractors.

	An information extractor takes a URL and pulls out of the page (or
	pages) it points to everything the FileDownloader needs: the real
	video URL, the title, a simplified title, the uploader and so on.
	The result is a dictionary that is handed to the FileDownloader,
	which may then download the video to the file system, among other
	possible outcomes. Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; usually they should also
	be added to the list of extractors.
	"""

	# Set to True by initialize() once _real_initialize() has run.
	_ready = False
	# The FileDownloader this extractor reports to (may stay None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run the one-time initialization (authentication, etc) if needed."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor should report to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1080
1081
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links, embed/e/v paths and bare
	# video ids; group 2 captures the video id itself.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Page fetched to force the site interface language to English.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Maps a format code to the file extension it is saved with.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	IE_NAME = u'youtube'
1104
1105         def report_lang(self):
1106                 """Report attempt to set language."""
1107                 self._downloader.to_screen(u'[youtube] Setting language')
1108
1109         def report_login(self):
1110                 """Report attempt to log in."""
1111                 self._downloader.to_screen(u'[youtube] Logging in')
1112
1113         def report_age_confirmation(self):
1114                 """Report attempt to confirm age."""
1115                 self._downloader.to_screen(u'[youtube] Confirming age')
1116
1117         def report_video_webpage_download(self, video_id):
1118                 """Report attempt to download video webpage."""
1119                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1120
1121         def report_video_info_webpage_download(self, video_id):
1122                 """Report attempt to download video info webpage."""
1123                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1124
1125         def report_information_extraction(self, video_id):
1126                 """Report attempt to extract video information."""
1127                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1128
1129         def report_unavailable_format(self, video_id, format):
1130                 """Report extracted video URL."""
1131                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1132
1133         def report_rtmp_download(self):
1134                 """Indicate the download will use the RTMP protocol."""
1135                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1136
	def _real_initialize(self):
		"""Set the site language and, when credentials are available,
		log in and confirm age. All failures are reported as warnings
		(or trouble, for age confirmation) and abort initialization."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (fetching _LANG_URL switches the session to English
		# so subsequent pages can be parsed reliably)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1205
1206         def _real_extract(self, url):
1207                 # Extract video id from URL
1208                 mobj = re.match(self._VALID_URL, url)
1209                 if mobj is None:
1210                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1211                         return
1212                 video_id = mobj.group(2)
1213
1214                 # Get video webpage
1215                 self.report_video_webpage_download(video_id)
1216                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1217                 try:
1218                         video_webpage = urllib2.urlopen(request).read()
1219                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1221                         return
1222
1223                 # Attempt to extract SWF player URL
1224                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1225                 if mobj is not None:
1226                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1227                 else:
1228                         player_url = None
1229
1230                 # Get video info
1231                 self.report_video_info_webpage_download(video_id)
1232                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1233                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1234                                         % (video_id, el_type))
1235                         request = urllib2.Request(video_info_url)
1236                         try:
1237                                 video_info_webpage = urllib2.urlopen(request).read()
1238                                 video_info = parse_qs(video_info_webpage)
1239                                 if 'token' in video_info:
1240                                         break
1241                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1242                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1243                                 return
1244                 if 'token' not in video_info:
1245                         if 'reason' in video_info:
1246                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1247                         else:
1248                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1249                         return
1250
1251                 # Start extracting information
1252                 self.report_information_extraction(video_id)
1253
1254                 # uploader
1255                 if 'author' not in video_info:
1256                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1257                         return
1258                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1259
1260                 # title
1261                 if 'title' not in video_info:
1262                         self._downloader.trouble(u'ERROR: unable to extract video title')
1263                         return
1264                 video_title = urllib.unquote_plus(video_info['title'][0])
1265                 video_title = video_title.decode('utf-8')
1266                 video_title = sanitize_title(video_title)
1267
1268                 # simplified title
1269                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1270                 simple_title = simple_title.strip(ur'_')
1271
1272                 # thumbnail image
1273                 if 'thumbnail_url' not in video_info:
1274                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1275                         video_thumbnail = ''
1276                 else:   # don't panic if we can't find it
1277                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1278
1279                 # upload date
1280                 upload_date = u'NA'
1281                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1282                 if mobj is not None:
1283                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1284                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1285                         for expression in format_expressions:
1286                                 try:
1287                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1288                                 except:
1289                                         pass
1290
1291                 # description
1292                 try:
1293                         lxml.etree
1294                 except NameError:
1295                         video_description = u'No description available.'
1296                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1297                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1298                                 if mobj is not None:
1299                                         video_description = mobj.group(1).decode('utf-8')
1300                 else:
1301                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1302                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1303                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1304                         # TODO use another parser
1305
1306                 # token
1307                 video_token = urllib.unquote_plus(video_info['token'][0])
1308
1309                 # Decide which formats to download
1310                 req_format = self._downloader.params.get('format', None)
1311
1312                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1313                         self.report_rtmp_download()
1314                         video_url_list = [(None, video_info['conn'][0])]
1315                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1316                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1317                         url_data = [parse_qs(uds) for uds in url_data_strs]
1318                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1319                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1320
1321                         format_limit = self._downloader.params.get('format_limit', None)
1322                         if format_limit is not None and format_limit in self._available_formats:
1323                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1324                         else:
1325                                 format_list = self._available_formats
1326                         existing_formats = [x for x in format_list if x in url_map]
1327                         if len(existing_formats) == 0:
1328                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1329                                 return
1330                         if req_format is None or req_format == 'best':
1331                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1332                         elif req_format == 'worst':
1333                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1334                         elif req_format in ('-1', 'all'):
1335                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1336                         else:
1337                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1338                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1339                                 req_formats = req_format.split('/')
1340                                 video_url_list = None
1341                                 for rf in req_formats:
1342                                         if rf in url_map:
1343                                                 video_url_list = [(rf, url_map[rf])]
1344                                                 break
1345                                 if video_url_list is None:
1346                                         self._downloader.trouble(u'ERROR: requested format not available')
1347                                         return
1348                 else:
1349                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1350                         return
1351
1352                 for format_param, video_real_url in video_url_list:
1353                         # At this point we have a new video
1354                         self._downloader.increment_downloads()
1355
1356                         # Extension
1357                         video_extension = self._video_extensions.get(format_param, 'flv')
1358
1359                         try:
1360                                 # Process video information
1361                                 self._downloader.process_info({
1362                                         'id':           video_id.decode('utf-8'),
1363                                         'url':          video_real_url.decode('utf-8'),
1364                                         'uploader':     video_uploader.decode('utf-8'),
1365                                         'upload_date':  upload_date,
1366                                         'title':        video_title,
1367                                         'stitle':       simple_title,
1368                                         'ext':          video_extension.decode('utf-8'),
1369                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1370                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1371                                         'description':  video_description,
1372                                         'player_url':   player_url,
1373                                 })
1374                         except UnavailableVideoError, err:
1375                                 self._downloader.trouble(u'\nERROR: unable to download video')
1376
1377
1378 class MetacafeIE(InfoExtractor):
1379         """Information Extractor for metacafe.com."""
1380
1381         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1382         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1383         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1384         _youtube_ie = None
1385         IE_NAME = u'metacafe'
1386
1387         def __init__(self, youtube_ie, downloader=None):
1388                 InfoExtractor.__init__(self, downloader)
1389                 self._youtube_ie = youtube_ie
1390
1391         def report_disclaimer(self):
1392                 """Report disclaimer retrieval."""
1393                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1394
1395         def report_age_confirmation(self):
1396                 """Report attempt to confirm age."""
1397                 self._downloader.to_screen(u'[metacafe] Confirming age')
1398
1399         def report_download_webpage(self, video_id):
1400                 """Report webpage download."""
1401                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1402
1403         def report_extraction(self, video_id):
1404                 """Report information extraction."""
1405                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1406
1407         def _real_initialize(self):
1408                 # Retrieve disclaimer
1409                 request = urllib2.Request(self._DISCLAIMER)
1410                 try:
1411                         self.report_disclaimer()
1412                         disclaimer = urllib2.urlopen(request).read()
1413                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1414                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1415                         return
1416
1417                 # Confirm age
1418                 disclaimer_form = {
1419                         'filters': '0',
1420                         'submit': "Continue - I'm over 18",
1421                         }
1422                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1423                 try:
1424                         self.report_age_confirmation()
1425                         disclaimer = urllib2.urlopen(request).read()
1426                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1427                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1428                         return
1429
1430         def _real_extract(self, url):
1431                 # Extract id and simplified title from URL
1432                 mobj = re.match(self._VALID_URL, url)
1433                 if mobj is None:
1434                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1435                         return
1436
1437                 video_id = mobj.group(1)
1438
1439                 # Check if video comes from YouTube
1440                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1441                 if mobj2 is not None:
1442                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1443                         return
1444
1445                 # At this point we have a new video
1446                 self._downloader.increment_downloads()
1447
1448                 simple_title = mobj.group(2).decode('utf-8')
1449
1450                 # Retrieve video webpage to extract further information
1451                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1452                 try:
1453                         self.report_download_webpage(video_id)
1454                         webpage = urllib2.urlopen(request).read()
1455                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1456                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1457                         return
1458
1459                 # Extract URL, uploader and title from webpage
1460                 self.report_extraction(video_id)
1461                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1462                 if mobj is not None:
1463                         mediaURL = urllib.unquote(mobj.group(1))
1464                         video_extension = mediaURL[-3:]
1465
1466                         # Extract gdaKey if available
1467                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1468                         if mobj is None:
1469                                 video_url = mediaURL
1470                         else:
1471                                 gdaKey = mobj.group(1)
1472                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1473                 else:
1474                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1475                         if mobj is None:
1476                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1477                                 return
1478                         vardict = parse_qs(mobj.group(1))
1479                         if 'mediaData' not in vardict:
1480                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1481                                 return
1482                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1483                         if mobj is None:
1484                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1485                                 return
1486                         mediaURL = mobj.group(1).replace('\\/', '/')
1487                         video_extension = mediaURL[-3:]
1488                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1489
1490                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1491                 if mobj is None:
1492                         self._downloader.trouble(u'ERROR: unable to extract title')
1493                         return
1494                 video_title = mobj.group(1).decode('utf-8')
1495                 video_title = sanitize_title(video_title)
1496
1497                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1498                 if mobj is None:
1499                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1500                         return
1501                 video_uploader = mobj.group(1)
1502
1503                 try:
1504                         # Process video information
1505                         self._downloader.process_info({
1506                                 'id':           video_id.decode('utf-8'),
1507                                 'url':          video_url.decode('utf-8'),
1508                                 'uploader':     video_uploader.decode('utf-8'),
1509                                 'upload_date':  u'NA',
1510                                 'title':        video_title,
1511                                 'stitle':       simple_title,
1512                                 'ext':          video_extension.decode('utf-8'),
1513                                 'format':       u'NA',
1514                                 'player_url':   None,
1515                         })
1516                 except UnavailableVideoError:
1517                         self._downloader.trouble(u'\nERROR: unable to download video')
1518
1519
1520 class DailymotionIE(InfoExtractor):
1521         """Information Extractor for Dailymotion"""
1522
1523         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1524         IE_NAME = u'dailymotion'
1525
1526         def __init__(self, downloader=None):
1527                 InfoExtractor.__init__(self, downloader)
1528
1529         def report_download_webpage(self, video_id):
1530                 """Report webpage download."""
1531                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1532
1533         def report_extraction(self, video_id):
1534                 """Report information extraction."""
1535                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1536
1537         def _real_initialize(self):
1538                 return
1539
1540         def _real_extract(self, url):
1541                 # Extract id and simplified title from URL
1542                 mobj = re.match(self._VALID_URL, url)
1543                 if mobj is None:
1544                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1545                         return
1546
1547                 # At this point we have a new video
1548                 self._downloader.increment_downloads()
1549                 video_id = mobj.group(1)
1550
1551                 simple_title = mobj.group(2).decode('utf-8')
1552                 video_extension = 'flv'
1553
1554                 # Retrieve video webpage to extract further information
1555                 request = urllib2.Request(url)
1556                 request.add_header('Cookie', 'family_filter=off')
1557                 try:
1558                         self.report_download_webpage(video_id)
1559                         webpage = urllib2.urlopen(request).read()
1560                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1562                         return
1563
1564                 # Extract URL, uploader and title from webpage
1565                 self.report_extraction(video_id)
1566                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1567                 if mobj is None:
1568                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1569                         return
1570                 sequence = urllib.unquote(mobj.group(1))
1571                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1572                 if mobj is None:
1573                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1574                         return
1575                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1576
1577                 # if needed add http://www.dailymotion.com/ if relative URL
1578
1579                 video_url = mediaURL
1580
1581                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1582                 if mobj is None:
1583                         self._downloader.trouble(u'ERROR: unable to extract title')
1584                         return
1585                 video_title = mobj.group(1).decode('utf-8')
1586                 video_title = sanitize_title(video_title)
1587
1588                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1589                 if mobj is None:
1590                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1591                         return
1592                 video_uploader = mobj.group(1)
1593
1594                 try:
1595                         # Process video information
1596                         self._downloader.process_info({
1597                                 'id':           video_id.decode('utf-8'),
1598                                 'url':          video_url.decode('utf-8'),
1599                                 'uploader':     video_uploader.decode('utf-8'),
1600                                 'upload_date':  u'NA',
1601                                 'title':        video_title,
1602                                 'stitle':       simple_title,
1603                                 'ext':          video_extension.decode('utf-8'),
1604                                 'format':       u'NA',
1605                                 'player_url':   None,
1606                         })
1607                 except UnavailableVideoError:
1608                         self._downloader.trouble(u'\nERROR: unable to download video')
1609
1610
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Matches video.google.<tld>/videoplay?docid=<id> across the listed TLDs;
	# group(1) is the docid.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No pre-extraction setup is needed for this extractor.
		return

	def _real_extract(self, url):
		"""Scrape the Google Video page for the media URL, title, description
		and (optionally) thumbnail, and hand the result to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Default to mp4; downgraded to flv below if only the escaped
		# videoUrl variant is present in the page.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct download_url; fall back to the \x-escaped
		# videoUrl flash variable (an flv stream).
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escapes: \x3d -> '=', \x26 -> '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# NOTE(review): unlike YoutubeIE, the simplified title is not
		# .strip()'ed of leading/trailing '_' here — confirm if intentional.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		# NOTE(review): a missing description span aborts the whole download
		# even though the description is non-essential; other extractors only
		# warn in comparable situations.
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail is only available via a search-results page, so
			# it is fetched only on explicit request.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1716
1717
1718 class PhotobucketIE(InfoExtractor):
1719         """Information extractor for photobucket.com."""
1720
1721         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1722         IE_NAME = u'photobucket'
1723
1724         def __init__(self, downloader=None):
1725                 InfoExtractor.__init__(self, downloader)
1726
1727         def report_download_webpage(self, video_id):
1728                 """Report webpage download."""
1729                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1730
1731         def report_extraction(self, video_id):
1732                 """Report information extraction."""
1733                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1734
1735         def _real_initialize(self):
1736                 return
1737
1738         def _real_extract(self, url):
1739                 # Extract id from URL
1740                 mobj = re.match(self._VALID_URL, url)
1741                 if mobj is None:
1742                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1743                         return
1744
1745                 # At this point we have a new video
1746                 self._downloader.increment_downloads()
1747                 video_id = mobj.group(1)
1748
1749                 video_extension = 'flv'
1750
1751                 # Retrieve video webpage to extract further information
1752                 request = urllib2.Request(url)
1753                 try:
1754                         self.report_download_webpage(video_id)
1755                         webpage = urllib2.urlopen(request).read()
1756                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1757                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1758                         return
1759
1760                 # Extract URL, uploader, and title from webpage
1761                 self.report_extraction(video_id)
1762                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1763                 if mobj is None:
1764                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1765                         return
1766                 mediaURL = urllib.unquote(mobj.group(1))
1767
1768                 video_url = mediaURL
1769
1770                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1771                 if mobj is None:
1772                         self._downloader.trouble(u'ERROR: unable to extract title')
1773                         return
1774                 video_title = mobj.group(1).decode('utf-8')
1775                 video_title = sanitize_title(video_title)
1776                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1777
1778                 video_uploader = mobj.group(2).decode('utf-8')
1779
1780                 try:
1781                         # Process video information
1782                         self._downloader.process_info({
1783                                 'id':           video_id.decode('utf-8'),
1784                                 'url':          video_url.decode('utf-8'),
1785                                 'uploader':     video_uploader,
1786                                 'upload_date':  u'NA',
1787                                 'title':        video_title,
1788                                 'stitle':       simple_title,
1789                                 'ext':          video_extension.decode('utf-8'),
1790                                 'format':       u'NA',
1791                                 'player_url':   None,
1792                         })
1793                 except UnavailableVideoError:
1794                         self._downloader.trouble(u'\nERROR: unable to download video')
1795
1796
1797 class YahooIE(InfoExtractor):
1798         """Information extractor for video.yahoo.com."""
1799
1800         # _VALID_URL matches all Yahoo! Video URLs
1801         # _VPAGE_URL matches only the extractable '/watch/' URLs
1802         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1803         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1804         IE_NAME = u'video.yahoo'
1805
	def __init__(self, downloader=None):
		# Standard extractor construction; all shared state is set up by
		# the InfoExtractor base class.
		InfoExtractor.__init__(self, downloader)
1808
1809         def report_download_webpage(self, video_id):
1810                 """Report webpage download."""
1811                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1812
1813         def report_extraction(self, video_id):
1814                 """Report information extraction."""
1815                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1816
1817         def _real_initialize(self):
1818                 return
1819
1820         def _real_extract(self, url, new_video=True):
1821                 # Extract ID from URL
1822                 mobj = re.match(self._VALID_URL, url)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1825                         return
1826
1827                 # At this point we have a new video
1828                 self._downloader.increment_downloads()
1829                 video_id = mobj.group(2)
1830                 video_extension = 'flv'
1831
1832                 # Rewrite valid but non-extractable URLs as
1833                 # extractable English language /watch/ URLs
1834                 if re.match(self._VPAGE_URL, url) is None:
1835                         request = urllib2.Request(url)
1836                         try:
1837                                 webpage = urllib2.urlopen(request).read()
1838                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1840                                 return
1841
1842                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1843                         if mobj is None:
1844                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1845                                 return
1846                         yahoo_id = mobj.group(1)
1847
1848                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1849                         if mobj is None:
1850                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1851                                 return
1852                         yahoo_vid = mobj.group(1)
1853
1854                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1855                         return self._real_extract(url, new_video=False)
1856
1857                 # Retrieve video webpage to extract further information
1858                 request = urllib2.Request(url)
1859                 try:
1860                         self.report_download_webpage(video_id)
1861                         webpage = urllib2.urlopen(request).read()
1862                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1863                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1864                         return
1865
1866                 # Extract uploader and title from webpage
1867                 self.report_extraction(video_id)
1868                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1869                 if mobj is None:
1870                         self._downloader.trouble(u'ERROR: unable to extract video title')
1871                         return
1872                 video_title = mobj.group(1).decode('utf-8')
1873                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1874
1875                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1876                 if mobj is None:
1877                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1878                         return
1879                 video_uploader = mobj.group(1).decode('utf-8')
1880
1881                 # Extract video thumbnail
1882                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1883                 if mobj is None:
1884                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1885                         return
1886                 video_thumbnail = mobj.group(1).decode('utf-8')
1887
1888                 # Extract video description
1889                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1890                 if mobj is None:
1891                         self._downloader.trouble(u'ERROR: unable to extract video description')
1892                         return
1893                 video_description = mobj.group(1).decode('utf-8')
1894                 if not video_description:
1895                         video_description = 'No description available.'
1896
1897                 # Extract video height and width
1898                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1899                 if mobj is None:
1900                         self._downloader.trouble(u'ERROR: unable to extract video height')
1901                         return
1902                 yv_video_height = mobj.group(1)
1903
1904                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1905                 if mobj is None:
1906                         self._downloader.trouble(u'ERROR: unable to extract video width')
1907                         return
1908                 yv_video_width = mobj.group(1)
1909
1910                 # Retrieve video playlist to extract media URL
1911                 # I'm not completely sure what all these options are, but we
1912                 # seem to need most of them, otherwise the server sends a 401.
1913                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1914                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1915                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1916                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1917                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1918                 try:
1919                         self.report_download_webpage(video_id)
1920                         webpage = urllib2.urlopen(request).read()
1921                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1922                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1923                         return
1924
1925                 # Extract media URL from playlist XML
1926                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1927                 if mobj is None:
1928                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1929                         return
1930                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1931                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1932
1933                 try:
1934                         # Process video information
1935                         self._downloader.process_info({
1936                                 'id':           video_id.decode('utf-8'),
1937                                 'url':          video_url,
1938                                 'uploader':     video_uploader,
1939                                 'upload_date':  u'NA',
1940                                 'title':        video_title,
1941                                 'stitle':       simple_title,
1942                                 'ext':          video_extension.decode('utf-8'),
1943                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1944                                 'description':  video_description,
1945                                 'thumbnail':    video_thumbnail,
1946                                 'player_url':   None,
1947                         })
1948                 except UnavailableVideoError:
1949                         self._downloader.trouble(u'\nERROR: unable to download video')
1950
1951
1952 class VimeoIE(InfoExtractor):
1953         """Information extractor for vimeo.com."""
1954
1955         # _VALID_URL matches Vimeo URLs
1956         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1957         IE_NAME = u'vimeo'
1958
1959         def __init__(self, downloader=None):
1960                 InfoExtractor.__init__(self, downloader)
1961
1962         def report_download_webpage(self, video_id):
1963                 """Report webpage download."""
1964                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1965
1966         def report_extraction(self, video_id):
1967                 """Report information extraction."""
1968                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1969
1970         def _real_initialize(self):
1971                 return
1972
1973         def _real_extract(self, url, new_video=True):
1974                 # Extract ID from URL
1975                 mobj = re.match(self._VALID_URL, url)
1976                 if mobj is None:
1977                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1978                         return
1979
1980                 # At this point we have a new video
1981                 self._downloader.increment_downloads()
1982                 video_id = mobj.group(1)
1983
1984                 # Retrieve video webpage to extract further information
1985                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1986                 try:
1987                         self.report_download_webpage(video_id)
1988                         webpage = urllib2.urlopen(request).read()
1989                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1990                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1991                         return
1992
1993                 # Now we begin extracting as much information as we can from what we
1994                 # retrieved. First we extract the information common to all extractors,
1995                 # and latter we extract those that are Vimeo specific.
1996                 self.report_extraction(video_id)
1997
1998                 # Extract title
1999                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2000                 if mobj is None:
2001                         self._downloader.trouble(u'ERROR: unable to extract video title')
2002                         return
2003                 video_title = mobj.group(1).decode('utf-8')
2004                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2005
2006                 # Extract uploader
2007                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2008                 if mobj is None:
2009                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2010                         return
2011                 video_uploader = mobj.group(1).decode('utf-8')
2012
2013                 # Extract video thumbnail
2014                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2015                 if mobj is None:
2016                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2017                         return
2018                 video_thumbnail = mobj.group(1).decode('utf-8')
2019
2020                 # # Extract video description
2021                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2022                 # if mobj is None:
2023                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2024                 #       return
2025                 # video_description = mobj.group(1).decode('utf-8')
2026                 # if not video_description: video_description = 'No description available.'
2027                 video_description = 'Foo.'
2028
2029                 # Vimeo specific: extract request signature
2030                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2031                 if mobj is None:
2032                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2033                         return
2034                 sig = mobj.group(1).decode('utf-8')
2035
2036                 # Vimeo specific: Extract request signature expiration
2037                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2038                 if mobj is None:
2039                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2040                         return
2041                 sig_exp = mobj.group(1).decode('utf-8')
2042
2043                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2044
2045                 try:
2046                         # Process video information
2047                         self._downloader.process_info({
2048                                 'id':           video_id.decode('utf-8'),
2049                                 'url':          video_url,
2050                                 'uploader':     video_uploader,
2051                                 'upload_date':  u'NA',
2052                                 'title':        video_title,
2053                                 'stitle':       simple_title,
2054                                 'ext':          u'mp4',
2055                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2056                                 'description':  video_description,
2057                                 'thumbnail':    video_thumbnail,
2058                                 'description':  video_description,
2059                                 'player_url':   None,
2060                         })
2061                 except UnavailableVideoError:
2062                         self._downloader.trouble(u'ERROR: unable to download video')
2063
2064
2065 class GenericIE(InfoExtractor):
2066         """Generic last-resort information extractor."""
2067
2068         _VALID_URL = r'.*'
2069         IE_NAME = u'generic'
2070
2071         def __init__(self, downloader=None):
2072                 InfoExtractor.__init__(self, downloader)
2073
2074         def report_download_webpage(self, video_id):
2075                 """Report webpage download."""
2076                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2077                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2078
2079         def report_extraction(self, video_id):
2080                 """Report information extraction."""
2081                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2082
2083         def _real_initialize(self):
2084                 return
2085
2086         def _real_extract(self, url):
2087                 # At this point we have a new video
2088                 self._downloader.increment_downloads()
2089
2090                 video_id = url.split('/')[-1]
2091                 request = urllib2.Request(url)
2092                 try:
2093                         self.report_download_webpage(video_id)
2094                         webpage = urllib2.urlopen(request).read()
2095                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2096                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2097                         return
2098                 except ValueError, err:
2099                         # since this is the last-resort InfoExtractor, if
2100                         # this error is thrown, it'll be thrown here
2101                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2102                         return
2103
2104                 self.report_extraction(video_id)
2105                 # Start with something easy: JW Player in SWFObject
2106                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2107                 if mobj is None:
2108                         # Broaden the search a little bit
2109                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2110                 if mobj is None:
2111                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2112                         return
2113
2114                 # It's possible that one of the regexes
2115                 # matched, but returned an empty group:
2116                 if mobj.group(1) is None:
2117                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2118                         return
2119
2120                 video_url = urllib.unquote(mobj.group(1))
2121                 video_id = os.path.basename(video_url)
2122
2123                 # here's a fun little line of code for you:
2124                 video_extension = os.path.splitext(video_id)[1][1:]
2125                 video_id = os.path.splitext(video_id)[0]
2126
2127                 # it's tempting to parse this further, but you would
2128                 # have to take into account all the variations like
2129                 #   Video Title - Site Name
2130                 #   Site Name | Video Title
2131                 #   Video Title - Tagline | Site Name
2132                 # and so on and so forth; it's just not practical
2133                 mobj = re.search(r'<title>(.*)</title>', webpage)
2134                 if mobj is None:
2135                         self._downloader.trouble(u'ERROR: unable to extract title')
2136                         return
2137                 video_title = mobj.group(1).decode('utf-8')
2138                 video_title = sanitize_title(video_title)
2139                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2140
2141                 # video uploader is domain name
2142                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2143                 if mobj is None:
2144                         self._downloader.trouble(u'ERROR: unable to extract title')
2145                         return
2146                 video_uploader = mobj.group(1).decode('utf-8')
2147
2148                 try:
2149                         # Process video information
2150                         self._downloader.process_info({
2151                                 'id':           video_id.decode('utf-8'),
2152                                 'url':          video_url.decode('utf-8'),
2153                                 'uploader':     video_uploader,
2154                                 'upload_date':  u'NA',
2155                                 'title':        video_title,
2156                                 'stitle':       simple_title,
2157                                 'ext':          video_extension.decode('utf-8'),
2158                                 'format':       u'NA',
2159                                 'player_url':   None,
2160                         })
2161                 except UnavailableVideoError, err:
2162                         self._downloader.trouble(u'\nERROR: unable to download video')
2163
2164
2165 class YoutubeSearchIE(InfoExtractor):
2166         """Information Extractor for YouTube search queries."""
2167         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2168         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2169         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2170         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2171         _youtube_ie = None
2172         _max_youtube_results = 1000
2173         IE_NAME = u'youtube:search'
2174
2175         def __init__(self, youtube_ie, downloader=None):
2176                 InfoExtractor.__init__(self, downloader)
2177                 self._youtube_ie = youtube_ie
2178
2179         def report_download_page(self, query, pagenum):
2180                 """Report attempt to download playlist page with given number."""
2181                 query = query.decode(preferredencoding())
2182                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2183
2184         def _real_initialize(self):
2185                 self._youtube_ie.initialize()
2186
2187         def _real_extract(self, query):
2188                 mobj = re.match(self._VALID_URL, query)
2189                 if mobj is None:
2190                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2191                         return
2192
2193                 prefix, query = query.split(':')
2194                 prefix = prefix[8:]
2195                 query = query.encode('utf-8')
2196                 if prefix == '':
2197                         self._download_n_results(query, 1)
2198                         return
2199                 elif prefix == 'all':
2200                         self._download_n_results(query, self._max_youtube_results)
2201                         return
2202                 else:
2203                         try:
2204                                 n = long(prefix)
2205                                 if n <= 0:
2206                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2207                                         return
2208                                 elif n > self._max_youtube_results:
2209                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2210                                         n = self._max_youtube_results
2211                                 self._download_n_results(query, n)
2212                                 return
2213                         except ValueError: # parsing prefix as integer fails
2214                                 self._download_n_results(query, 1)
2215                                 return
2216
2217         def _download_n_results(self, query, n):
2218                 """Downloads a specified number of results for a query"""
2219
2220                 video_ids = []
2221                 already_seen = set()
2222                 pagenum = 1
2223
2224                 while True:
2225                         self.report_download_page(query, pagenum)
2226                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2227                         request = urllib2.Request(result_url)
2228                         try:
2229                                 page = urllib2.urlopen(request).read()
2230                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2231                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2232                                 return
2233
2234                         # Extract video identifiers
2235                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2236                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2237                                 if video_id not in already_seen:
2238                                         video_ids.append(video_id)
2239                                         already_seen.add(video_id)
2240                                         if len(video_ids) == n:
2241                                                 # Specified n videos reached
2242                                                 for id in video_ids:
2243                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2244                                                 return
2245
2246                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2247                                 for id in video_ids:
2248                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2249                                 return
2250
2251                         pagenum = pagenum + 1
2252
2253
2254 class GoogleSearchIE(InfoExtractor):
2255         """Information Extractor for Google Video search queries."""
2256         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2257         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2258         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2259         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2260         _google_ie = None
2261         _max_google_results = 1000
2262         IE_NAME = u'video.google:search'
2263
2264         def __init__(self, google_ie, downloader=None):
2265                 InfoExtractor.__init__(self, downloader)
2266                 self._google_ie = google_ie
2267
2268         def report_download_page(self, query, pagenum):
2269                 """Report attempt to download playlist page with given number."""
2270                 query = query.decode(preferredencoding())
2271                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2272
2273         def _real_initialize(self):
2274                 self._google_ie.initialize()
2275
2276         def _real_extract(self, query):
2277                 mobj = re.match(self._VALID_URL, query)
2278                 if mobj is None:
2279                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2280                         return
2281
2282                 prefix, query = query.split(':')
2283                 prefix = prefix[8:]
2284                 query = query.encode('utf-8')
2285                 if prefix == '':
2286                         self._download_n_results(query, 1)
2287                         return
2288                 elif prefix == 'all':
2289                         self._download_n_results(query, self._max_google_results)
2290                         return
2291                 else:
2292                         try:
2293                                 n = long(prefix)
2294                                 if n <= 0:
2295                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2296                                         return
2297                                 elif n > self._max_google_results:
2298                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2299                                         n = self._max_google_results
2300                                 self._download_n_results(query, n)
2301                                 return
2302                         except ValueError: # parsing prefix as integer fails
2303                                 self._download_n_results(query, 1)
2304                                 return
2305
2306         def _download_n_results(self, query, n):
2307                 """Downloads a specified number of results for a query"""
2308
2309                 video_ids = []
2310                 already_seen = set()
2311                 pagenum = 1
2312
2313                 while True:
2314                         self.report_download_page(query, pagenum)
2315                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2316                         request = urllib2.Request(result_url)
2317                         try:
2318                                 page = urllib2.urlopen(request).read()
2319                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2321                                 return
2322
2323                         # Extract video identifiers
2324                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2325                                 video_id = mobj.group(1)
2326                                 if video_id not in already_seen:
2327                                         video_ids.append(video_id)
2328                                         already_seen.add(video_id)
2329                                         if len(video_ids) == n:
2330                                                 # Specified n videos reached
2331                                                 for id in video_ids:
2332                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2333                                                 return
2334
2335                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2336                                 for id in video_ids:
2337                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2338                                 return
2339
2340                         pagenum = pagenum + 1
2341
2342
2343 class YahooSearchIE(InfoExtractor):
2344         """Information Extractor for Yahoo! Video search queries."""
2345         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2346         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2347         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2348         _MORE_PAGES_INDICATOR = r'\s*Next'
2349         _yahoo_ie = None
2350         _max_yahoo_results = 1000
2351         IE_NAME = u'video.yahoo:search'
2352
	def __init__(self, yahoo_ie, downloader=None):
		"""Store the YahooIE instance used to extract each search result."""
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie
2356
2357         def report_download_page(self, query, pagenum):
2358                 """Report attempt to download playlist page with given number."""
2359                 query = query.decode(preferredencoding())
2360                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2361
	def _real_initialize(self):
		"""Delegate initialization to the wrapped YahooIE instance."""
		self._yahoo_ie.initialize()
2364
2365         def _real_extract(self, query):
2366                 mobj = re.match(self._VALID_URL, query)
2367                 if mobj is None:
2368                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2369                         return
2370
2371                 prefix, query = query.split(':')
2372                 prefix = prefix[8:]
2373                 query = query.encode('utf-8')
2374                 if prefix == '':
2375                         self._download_n_results(query, 1)
2376                         return
2377                 elif prefix == 'all':
2378                         self._download_n_results(query, self._max_yahoo_results)
2379                         return
2380                 else:
2381                         try:
2382                                 n = long(prefix)
2383                                 if n <= 0:
2384                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2385                                         return
2386                                 elif n > self._max_yahoo_results:
2387                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2388                                         n = self._max_yahoo_results
2389                                 self._download_n_results(query, n)
2390                                 return
2391                         except ValueError: # parsing prefix as integer fails
2392                                 self._download_n_results(query, 1)
2393                                 return
2394
        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                # Yahoo result pages may repeat videos; dedupe across all pages.
                already_seen = set()
                pagenum = 1

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
                        request = urllib2.Request(result_url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                                return

                        # Extract video identifiers
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = mobj.group(1)
                                if video_id not in already_seen:
                                        video_ids.append(video_id)
                                        already_seen.add(video_id)
                                        if len(video_ids) == n:
                                                # Specified n videos reached
                                                for id in video_ids:
                                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                                                return

                        # No "Next" link on the page: results are exhausted, so
                        # hand over whatever was collected (may be fewer than n).
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                for id in video_ids:
                                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                                return

                        pagenum = pagenum + 1
2430
2431
2432 class YoutubePlaylistIE(InfoExtractor):
2433         """Information Extractor for YouTube playlists."""
2434
2435         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2436         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2437         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2438         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2439         _youtube_ie = None
2440         IE_NAME = u'youtube:playlist'
2441
2442         def __init__(self, youtube_ie, downloader=None):
2443                 InfoExtractor.__init__(self, downloader)
2444                 self._youtube_ie = youtube_ie
2445
2446         def report_download_page(self, playlist_id, pagenum):
2447                 """Report attempt to download playlist page with given number."""
2448                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2449
2450         def _real_initialize(self):
2451                 self._youtube_ie.initialize()
2452
2453         def _real_extract(self, url):
2454                 # Extract playlist id
2455                 mobj = re.match(self._VALID_URL, url)
2456                 if mobj is None:
2457                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2458                         return
2459
2460                 # Single video case
2461                 if mobj.group(3) is not None:
2462                         self._youtube_ie.extract(mobj.group(3))
2463                         return
2464
2465                 # Download playlist pages
2466                 # prefix is 'p' as default for playlists but there are other types that need extra care
2467                 playlist_prefix = mobj.group(1)
2468                 if playlist_prefix == 'a':
2469                         playlist_access = 'artist'
2470                 else:
2471                         playlist_prefix = 'p'
2472                         playlist_access = 'view_play_list'
2473                 playlist_id = mobj.group(2)
2474                 video_ids = []
2475                 pagenum = 1
2476
2477                 while True:
2478                         self.report_download_page(playlist_id, pagenum)
2479                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2480                         try:
2481                                 page = urllib2.urlopen(request).read()
2482                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2483                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2484                                 return
2485
2486                         # Extract video identifiers
2487                         ids_in_page = []
2488                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2489                                 if mobj.group(1) not in ids_in_page:
2490                                         ids_in_page.append(mobj.group(1))
2491                         video_ids.extend(ids_in_page)
2492
2493                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2494                                 break
2495                         pagenum = pagenum + 1
2496
2497                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2498                 playlistend = self._downloader.params.get('playlistend', -1)
2499                 video_ids = video_ids[playliststart:playlistend]
2500
2501                 for id in video_ids:
2502                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2503                 return
2504
2505
2506 class YoutubeUserIE(InfoExtractor):
2507         """Information Extractor for YouTube users."""
2508
2509         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2510         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2511         _GDATA_PAGE_SIZE = 50
2512         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2513         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2514         _youtube_ie = None
2515         IE_NAME = u'youtube:user'
2516
2517         def __init__(self, youtube_ie, downloader=None):
2518                 InfoExtractor.__init__(self, downloader)
2519                 self._youtube_ie = youtube_ie
2520
2521         def report_download_page(self, username, start_index):
2522                 """Report attempt to download user page."""
2523                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2524                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2525
2526         def _real_initialize(self):
2527                 self._youtube_ie.initialize()
2528
2529         def _real_extract(self, url):
2530                 # Extract username
2531                 mobj = re.match(self._VALID_URL, url)
2532                 if mobj is None:
2533                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2534                         return
2535
2536                 username = mobj.group(1)
2537
2538                 # Download video ids using YouTube Data API. Result size per
2539                 # query is limited (currently to 50 videos) so we need to query
2540                 # page by page until there are no video ids - it means we got
2541                 # all of them.
2542
2543                 video_ids = []
2544                 pagenum = 0
2545
2546                 while True:
2547                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2548                         self.report_download_page(username, start_index)
2549
2550                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2551
2552                         try:
2553                                 page = urllib2.urlopen(request).read()
2554                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2555                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2556                                 return
2557
2558                         # Extract video identifiers
2559                         ids_in_page = []
2560
2561                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562                                 if mobj.group(1) not in ids_in_page:
2563                                         ids_in_page.append(mobj.group(1))
2564
2565                         video_ids.extend(ids_in_page)
2566
2567                         # A little optimization - if current page is not
2568                         # "full", ie. does not contain PAGE_SIZE video ids then
2569                         # we can assume that this page is the last one - there
2570                         # are no more ids on further pages - no need to query
2571                         # again.
2572
2573                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2574                                 break
2575
2576                         pagenum += 1
2577
2578                 all_ids_count = len(video_ids)
2579                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2580                 playlistend = self._downloader.params.get('playlistend', -1)
2581
2582                 if playlistend == -1:
2583                         video_ids = video_ids[playliststart:]
2584                 else:
2585                         video_ids = video_ids[playliststart:playlistend]
2586
2587                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2588                                 (username, all_ids_count, len(video_ids)))
2589
2590                 for video_id in video_ids:
2591                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2592
2593
2594 class DepositFilesIE(InfoExtractor):
2595         """Information extractor for depositfiles.com"""
2596
2597         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2598         IE_NAME = u'DepositFiles'
2599
2600         def __init__(self, downloader=None):
2601                 InfoExtractor.__init__(self, downloader)
2602
2603         def report_download_webpage(self, file_id):
2604                 """Report webpage download."""
2605                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2606
2607         def report_extraction(self, file_id):
2608                 """Report information extraction."""
2609                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2610
2611         def _real_initialize(self):
2612                 return
2613
2614         def _real_extract(self, url):
2615                 # At this point we have a new file
2616                 self._downloader.increment_downloads()
2617
2618                 file_id = url.split('/')[-1]
2619                 # Rebuild url in english locale
2620                 url = 'http://depositfiles.com/en/files/' + file_id
2621
2622                 # Retrieve file webpage with 'Free download' button pressed
2623                 free_download_indication = { 'gateway_result' : '1' }
2624                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2625                 try:
2626                         self.report_download_webpage(file_id)
2627                         webpage = urllib2.urlopen(request).read()
2628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2629                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2630                         return
2631
2632                 # Search for the real file URL
2633                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2634                 if (mobj is None) or (mobj.group(1) is None):
2635                         # Try to figure out reason of the error.
2636                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2637                         if (mobj is not None) and (mobj.group(1) is not None):
2638                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2639                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2640                         else:
2641                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2642                         return
2643
2644                 file_url = mobj.group(1)
2645                 file_extension = os.path.splitext(file_url)[1][1:]
2646
2647                 # Search for file title
2648                 mobj = re.search(r'<b title="(.*?)">', webpage)
2649                 if mobj is None:
2650                         self._downloader.trouble(u'ERROR: unable to extract title')
2651                         return
2652                 file_title = mobj.group(1).decode('utf-8')
2653
2654                 try:
2655                         # Process file information
2656                         self._downloader.process_info({
2657                                 'id':           file_id.decode('utf-8'),
2658                                 'url':          file_url.decode('utf-8'),
2659                                 'uploader':     u'NA',
2660                                 'upload_date':  u'NA',
2661                                 'title':        file_title,
2662                                 'stitle':       file_title,
2663                                 'ext':          file_extension.decode('utf-8'),
2664                                 'format':       u'NA',
2665                                 'player_url':   None,
2666                         })
2667                 except UnavailableVideoError, err:
2668                         self._downloader.trouble(u'ERROR: unable to download file')
2669
2670
2671 class FacebookIE(InfoExtractor):
2672         """Information Extractor for Facebook"""
2673
2674         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2675         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2676         _NETRC_MACHINE = 'facebook'
2677         _available_formats = ['highqual', 'lowqual']
2678         _video_extensions = {
2679                 'highqual': 'mp4',
2680                 'lowqual': 'mp4',
2681         }
2682         IE_NAME = u'facebook'
2683
2684         def __init__(self, downloader=None):
2685                 InfoExtractor.__init__(self, downloader)
2686
2687         def _reporter(self, message):
2688                 """Add header and report message."""
2689                 self._downloader.to_screen(u'[facebook] %s' % message)
2690
2691         def report_login(self):
2692                 """Report attempt to log in."""
2693                 self._reporter(u'Logging in')
2694
2695         def report_video_webpage_download(self, video_id):
2696                 """Report attempt to download video webpage."""
2697                 self._reporter(u'%s: Downloading video webpage' % video_id)
2698
2699         def report_information_extraction(self, video_id):
2700                 """Report attempt to extract video information."""
2701                 self._reporter(u'%s: Extracting video information' % video_id)
2702
2703         def _parse_page(self, video_webpage):
2704                 """Extract video information from page"""
2705                 # General data
2706                 data = {'title': r'class="video_title datawrap">(.*?)</',
2707                         'description': r'<div class="datawrap">(.*?)</div>',
2708                         'owner': r'\("video_owner_name", "(.*?)"\)',
2709                         'upload_date': r'data-date="(.*?)"',
2710                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2711                         }
2712                 video_info = {}
2713                 for piece in data.keys():
2714                         mobj = re.search(data[piece], video_webpage)
2715                         if mobj is not None:
2716                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2717
2718                 # Video urls
2719                 video_urls = {}
2720                 for fmt in self._available_formats:
2721                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2722                         if mobj is not None:
2723                                 # URL is in a Javascript segment inside an escaped Unicode format within
2724                                 # the generally utf-8 page
2725                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2726                 video_info['video_urls'] = video_urls
2727
2728                 return video_info
2729
2730         def _real_initialize(self):
2731                 if self._downloader is None:
2732                         return
2733
2734                 useremail = None
2735                 password = None
2736                 downloader_params = self._downloader.params
2737
2738                 # Attempt to use provided username and password or .netrc data
2739                 if downloader_params.get('username', None) is not None:
2740                         useremail = downloader_params['username']
2741                         password = downloader_params['password']
2742                 elif downloader_params.get('usenetrc', False):
2743                         try:
2744                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2745                                 if info is not None:
2746                                         useremail = info[0]
2747                                         password = info[2]
2748                                 else:
2749                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2750                         except (IOError, netrc.NetrcParseError), err:
2751                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2752                                 return
2753
2754                 if useremail is None:
2755                         return
2756
2757                 # Log in
2758                 login_form = {
2759                         'email': useremail,
2760                         'pass': password,
2761                         'login': 'Log+In'
2762                         }
2763                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2764                 try:
2765                         self.report_login()
2766                         login_results = urllib2.urlopen(request).read()
2767                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2768                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2769                                 return
2770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2772                         return
2773
2774         def _real_extract(self, url):
2775                 mobj = re.match(self._VALID_URL, url)
2776                 if mobj is None:
2777                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2778                         return
2779                 video_id = mobj.group('ID')
2780
2781                 # Get video webpage
2782                 self.report_video_webpage_download(video_id)
2783                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2784                 try:
2785                         page = urllib2.urlopen(request)
2786                         video_webpage = page.read()
2787                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2788                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2789                         return
2790
2791                 # Start extracting information
2792                 self.report_information_extraction(video_id)
2793
2794                 # Extract information
2795                 video_info = self._parse_page(video_webpage)
2796
2797                 # uploader
2798                 if 'owner' not in video_info:
2799                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2800                         return
2801                 video_uploader = video_info['owner']
2802
2803                 # title
2804                 if 'title' not in video_info:
2805                         self._downloader.trouble(u'ERROR: unable to extract video title')
2806                         return
2807                 video_title = video_info['title']
2808                 video_title = video_title.decode('utf-8')
2809                 video_title = sanitize_title(video_title)
2810
2811                 # simplified title
2812                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2813                 simple_title = simple_title.strip(ur'_')
2814
2815                 # thumbnail image
2816                 if 'thumbnail' not in video_info:
2817                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2818                         video_thumbnail = ''
2819                 else:
2820                         video_thumbnail = video_info['thumbnail']
2821
2822                 # upload date
2823                 upload_date = u'NA'
2824                 if 'upload_date' in video_info:
2825                         upload_time = video_info['upload_date']
2826                         timetuple = email.utils.parsedate_tz(upload_time)
2827                         if timetuple is not None:
2828                                 try:
2829                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2830                                 except:
2831                                         pass
2832
2833                 # description
2834                 video_description = video_info.get('description', 'No description available.')
2835
2836                 url_map = video_info['video_urls']
2837                 if len(url_map.keys()) > 0:
2838                         # Decide which formats to download
2839                         req_format = self._downloader.params.get('format', None)
2840                         format_limit = self._downloader.params.get('format_limit', None)
2841
2842                         if format_limit is not None and format_limit in self._available_formats:
2843                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2844                         else:
2845                                 format_list = self._available_formats
2846                         existing_formats = [x for x in format_list if x in url_map]
2847                         if len(existing_formats) == 0:
2848                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2849                                 return
2850                         if req_format is None:
2851                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2852                         elif req_format == 'worst':
2853                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2854                         elif req_format == '-1':
2855                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2856                         else:
2857                                 # Specific format
2858                                 if req_format not in url_map:
2859                                         self._downloader.trouble(u'ERROR: requested format not available')
2860                                         return
2861                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2862
2863                 for format_param, video_real_url in video_url_list:
2864
2865                         # At this point we have a new video
2866                         self._downloader.increment_downloads()
2867
2868                         # Extension
2869                         video_extension = self._video_extensions.get(format_param, 'mp4')
2870
2871                         try:
2872                                 # Process video information
2873                                 self._downloader.process_info({
2874                                         'id':           video_id.decode('utf-8'),
2875                                         'url':          video_real_url.decode('utf-8'),
2876                                         'uploader':     video_uploader.decode('utf-8'),
2877                                         'upload_date':  upload_date,
2878                                         'title':        video_title,
2879                                         'stitle':       simple_title,
2880                                         'ext':          video_extension.decode('utf-8'),
2881                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2882                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2883                                         'description':  video_description.decode('utf-8'),
2884                                         'player_url':   None,
2885                                 })
2886                         except UnavailableVideoError, err:
2887                                 self._downloader.trouble(u'\nERROR: unable to download video')
2888
2889 class BlipTVIE(InfoExtractor):
2890         """Information extractor for blip.tv"""
2891
2892         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2893         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2894         IE_NAME = u'blip.tv'
2895
2896         def report_extraction(self, file_id):
2897                 """Report information extraction."""
2898                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2899
2900         def _simplify_title(self, title):
2901                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2902                 res = res.strip(ur'_')
2903                 return res
2904
2905         def _real_extract(self, url):
2906                 mobj = re.match(self._VALID_URL, url)
2907                 if mobj is None:
2908                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2909                         return
2910
2911                 if '?' in url:
2912                         cchar = '&'
2913                 else:
2914                         cchar = '?'
2915                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2916                 request = urllib2.Request(json_url)
2917                 self.report_extraction(mobj.group(1))
2918                 try:
2919                         json_code = urllib2.urlopen(request).read()
2920                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2921                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2922                         return
2923                 try:
2924                         json_data = json.loads(json_code)
2925                         if 'Post' in json_data:
2926                                 data = json_data['Post']
2927                         else:
2928                                 data = json_data
2929
2930                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2931                         video_url = data['media']['url']
2932                         umobj = re.match(self._URL_EXT, video_url)
2933                         if umobj is None:
2934                                 raise ValueError('Can not determine filename extension')
2935                         ext = umobj.group(1)
2936
2937                         self._downloader.increment_downloads()
2938
2939                         info = {
2940                                 'id': data['item_id'],
2941                                 'url': video_url,
2942                                 'uploader': data['display_name'],
2943                                 'upload_date': upload_date,
2944                                 'title': data['title'],
2945                                 'stitle': self._simplify_title(data['title']),
2946                                 'ext': ext,
2947                                 'format': data['media']['mimeType'],
2948                                 'thumbnail': data['thumbnailUrl'],
2949                                 'description': data['description'],
2950                                 'player_url': data['embedUrl']
2951                         }
2952                 except (ValueError,KeyError), err:
2953                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2954                         return
2955
2956                 try:
2957                         self._downloader.process_info(info)
2958                 except UnavailableVideoError, err:
2959                         self._downloader.trouble(u'\nERROR: unable to download video')
2960
2961
2962 class MyVideoIE(InfoExtractor):
2963         """Information Extractor for myvideo.de."""
2964
2965         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2966         IE_NAME = u'myvideo'
2967
        def __init__(self, downloader=None):
                # Plain delegation; this extractor keeps no state of its own.
                InfoExtractor.__init__(self, downloader)
2970         
2971         def report_download_webpage(self, video_id):
2972                 """Report webpage download."""
2973                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2974
2975         def report_extraction(self, video_id):
2976                 """Report information extraction."""
2977                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2978
        def _real_initialize(self):
                # No initialization is required for this extractor.
                return
2981
2982         def _real_extract(self,url):
2983                 mobj = re.match(self._VALID_URL, url)
2984                 if mobj is None:
2985                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2986                         return
2987
2988                 video_id = mobj.group(1)
2989                 simple_title = mobj.group(2).decode('utf-8')
2990                 # should actually not be necessary
2991                 simple_title = sanitize_title(simple_title)
2992                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2993
2994                 # Get video webpage
2995                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2996                 try:
2997                         self.report_download_webpage(video_id)
2998                         webpage = urllib2.urlopen(request).read()
2999                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3000                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3001                         return
3002
3003                 self.report_extraction(video_id)
3004                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3005                                  webpage)
3006                 if mobj is None:
3007                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3008                         return
3009                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3010
3011                 mobj = re.search('<title>([^<]+)</title>', webpage)
3012                 if mobj is None:
3013                         self._downloader.trouble(u'ERROR: unable to extract title')
3014                         return
3015
3016                 video_title = mobj.group(1)
3017                 video_title = sanitize_title(video_title)
3018
3019                 try:
3020                         print(video_url)
3021                         self._downloader.process_info({
3022                                 'id':           video_id,
3023                                 'url':          video_url,
3024                                 'uploader':     u'NA',
3025                                 'upload_date':  u'NA',
3026                                 'title':        video_title,
3027                                 'stitle':       simple_title,
3028                                 'ext':          u'flv',
3029                                 'format':       u'NA',
3030                                 'player_url':   None,
3031                         })
3032                 except UnavailableVideoError:
3033                         self._downloader.trouble(u'\nERROR: Unable to download video')
3034
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a ":shortname" alias (e.g. ":tds") or a full-episodes URL
	# on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
	
	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse runs of disallowed characters into underscores and strip
		# leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Extract and download all media items for the given episode URL.

		Pipeline: resolve shortname aliases -> fetch episode page (following
		redirects for "newest episode" requests) -> locate the Flash movie
		params -> download the MRSS index -> for each <item>, fetch its
		config XML, pick a rendition, and hand it to the downloader.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A ":shortname" alias means "newest full episode of that show";
		# rewrite it to the show's full-episodes landing page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means we landed on the index page and
		# must follow the redirect to the newest episode.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# geturl() reflects any HTTP redirect; it should now name a
			# concrete episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match is (full player URL, uri suffix used for the feeds).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirects; the final URL is
		# passed along with each downloaded item as 'player_url'.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# The guid is a colon-separated mtvn URI; its last segment is the
			# media id and the second-to-last identifies the show.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# One failed item should not abort the remaining ones.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3174
3175
3176 class EscapistIE(InfoExtractor):
3177         """Information extractor for The Escapist """
3178
3179         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3180         IE_NAME = u'escapist'
3181
3182         def report_extraction(self, showName):
3183                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3184
3185         def report_config_download(self, showName):
3186                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3187
3188         def _simplify_title(self, title):
3189                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3190                 res = res.strip(ur'_')
3191                 return res
3192
3193         def _real_extract(self, url):
3194                 htmlParser = HTMLParser.HTMLParser()
3195
3196                 mobj = re.match(self._VALID_URL, url)
3197                 if mobj is None:
3198                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3199                         return
3200                 showName = mobj.group('showname')
3201                 videoId = mobj.group('episode')
3202
3203                 self.report_extraction(showName)
3204                 try:
3205                         webPage = urllib2.urlopen(url).read()
3206                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3207                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3208                         return
3209
3210                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3211                 description = htmlParser.unescape(descMatch.group(1))
3212                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3213                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3214                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3215                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3216                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3217                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3218
3219                 self.report_config_download(showName)
3220                 try:
3221                         configJSON = urllib2.urlopen(configUrl).read()
3222                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3223                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3224                         return
3225
3226                 # Technically, it's JavaScript, not JSON
3227                 configJSON = configJSON.replace("'", '"')
3228
3229                 try:
3230                         config = json.loads(configJSON)
3231                 except (ValueError,), err:
3232                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3233                         return
3234
3235                 playlist = config['playlist']
3236                 videoUrl = playlist[1]['url']
3237
3238                 self._downloader.increment_downloads()
3239                 info = {
3240                         'id': videoId,
3241                         'url': videoUrl,
3242                         'uploader': showName,
3243                         'upload_date': None,
3244                         'title': showName,
3245                         'stitle': self._simplify_title(showName),
3246                         'ext': 'flv',
3247                         'format': 'flv',
3248                         'thumbnail': imgUrl,
3249                         'description': description,
3250                         'player_url': playerUrl,
3251                 }
3252
3253                 try:
3254                         self._downloader.process_info(info)
3255                 except UnavailableVideoError, err:
3256                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3257
3258
3259
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a downloader through the downloader's
	add_post_processor() method. After a successful download, the
	downloader walks its chain of post processors and calls run() on each
	one, first with an initial argument and afterwards with whatever the
	previous processor returned.

	The chain stops as soon as a processor returns None, or when the last
	processor has run.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	# Downloader this post processor is registered with (set in __init__
	# or via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		"information" is an InfoExtractor-style dictionary carrying one
		extra key, "filepath", which names the downloaded file.

		Returning None halts the post-processing chain. Returning a
		dictionary (typically the received one, possibly with some fields
		changed) passes it along to the next processor in the chain.

		This method may also raise a PostProcessingError, which the
		calling downloader takes into account.
		"""
		return information # by default, do nothing
3305
3306
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		"""
		preferredcodec: target audio codec ('best', 'aac', 'mp3' or
			'vorbis'); None is treated as 'best'.
		preferredquality: audio bitrate passed to ffmpeg's -ab, or None.
		keepvideo: if False, the original video file is deleted after a
			successful extraction.
		"""
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the file's audio stream via ffprobe,
		or None if ffprobe is unavailable, fails, or finds no audio."""
		try:
			# NOTE(review): '--' is presumably meant to end option parsing so
			# a path starting with '-' is safe — verify ffprobe accepts it.
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# Remember the most recent codec_name and return it once a stream is
		# flagged as audio (relies on codec_name preceding codec_type within
		# each stream section of ffprobe's output).
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract/convert audio; return True on success
		(exit status 0), False on failure or if ffmpeg cannot be run."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract the audio track of information['filepath'].

		Returns the information dict with 'filepath' pointing at the new
		audio file, or None if probing, conversion, or cleanup failed.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3' or filecodec == 'vorbis':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable standalone.
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		# Output file: same prefix as the video, new audio extension.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3406
3407
3408 def updateSelf(downloader, filename):
3409         ''' Update the program file with the latest version from the repository '''
3410         # Note: downloader only used for options
3411         if not os.access(filename, os.W_OK):
3412                 sys.exit('ERROR: no write permissions on %s' % filename)
3413
3414         downloader.to_screen('Updating to latest version...')
3415
3416         try:
3417                 try:
3418                         urlh = urllib.urlopen(UPDATE_URL)
3419                         newcontent = urlh.read()
3420                         
3421                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3422                         if vmatch is not None and vmatch.group(1) == __version__:
3423                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3424                                 return
3425                 finally:
3426                         urlh.close()
3427         except (IOError, OSError), err:
3428                 sys.exit('ERROR: unable to download latest version')
3429
3430         try:
3431                 outf = open(filename, 'wb')
3432                 try:
3433                         outf.write(newcontent)
3434                 finally:
3435                         outf.close()
3436         except (IOError, OSError), err:
3437                 sys.exit('ERROR: unable to overwrite current version')
3438
3439         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3440
3441 def parseOpts():
3442         # Deferred imports
3443         import getpass
3444         import optparse
3445
3446         def _format_option_string(option):
3447                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3448
3449                 opts = []
3450
3451                 if option._short_opts: opts.append(option._short_opts[0])
3452                 if option._long_opts: opts.append(option._long_opts[0])
3453                 if len(opts) > 1: opts.insert(1, ', ')
3454
3455                 if option.takes_value(): opts.append(' %s' % option.metavar)
3456
3457                 return "".join(opts)
3458
3459         def _find_term_columns():
3460                 columns = os.environ.get('COLUMNS', None)
3461                 if columns:
3462                         return int(columns)
3463
3464                 try:
3465                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3466                         out,err = sp.communicate()
3467                         return int(out.split()[1])
3468                 except:
3469                         pass
3470                 return None
3471
3472         max_width = 80
3473         max_help_position = 80
3474
3475         # No need to wrap help messages if we're on a wide console
3476         columns = _find_term_columns()
3477         if columns: max_width = columns
3478
3479         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3480         fmt.format_option_strings = _format_option_string
3481
3482         kw = {
3483                 'version'   : __version__,
3484                 'formatter' : fmt,
3485                 'usage' : '%prog [options] url [url...]',
3486                 'conflict_handler' : 'resolve',
3487         }
3488
3489         parser = optparse.OptionParser(**kw)
3490
3491         # option groups
3492         general        = optparse.OptionGroup(parser, 'General Options')
3493         selection      = optparse.OptionGroup(parser, 'Video Selection')
3494         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3495         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3496         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3497         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3498         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3499
3500         general.add_option('-h', '--help',
3501                         action='help', help='print this help text and exit')
3502         general.add_option('-v', '--version',
3503                         action='version', help='print program version and exit')
3504         general.add_option('-U', '--update',
3505                         action='store_true', dest='update_self', help='update this program to latest version')
3506         general.add_option('-i', '--ignore-errors',
3507                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3508         general.add_option('-r', '--rate-limit',
3509                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3510         general.add_option('-R', '--retries',
3511                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3512         general.add_option('--dump-user-agent',
3513                         action='store_true', dest='dump_user_agent',
3514                         help='display the current browser identification', default=False)
3515         general.add_option('--list-extractors',
3516                         action='store_true', dest='list_extractors',
3517                         help='List all supported extractors and the URLs they would handle', default=False)
3518
3519         selection.add_option('--playlist-start',
3520                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3521         selection.add_option('--playlist-end',
3522                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3523         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3524         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3525
3526         authentication.add_option('-u', '--username',
3527                         dest='username', metavar='USERNAME', help='account username')
3528         authentication.add_option('-p', '--password',
3529                         dest='password', metavar='PASSWORD', help='account password')
3530         authentication.add_option('-n', '--netrc',
3531                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3532
3533
3534         video_format.add_option('-f', '--format',
3535                         action='store', dest='format', metavar='FORMAT', help='video format code')
3536         video_format.add_option('--all-formats',
3537                         action='store_const', dest='format', help='download all available video formats', const='all')
3538         video_format.add_option('--max-quality',
3539                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3540
3541
3542         verbosity.add_option('-q', '--quiet',
3543                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3544         verbosity.add_option('-s', '--simulate',
3545                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3546         verbosity.add_option('--skip-download',
3547                         action='store_true', dest='skip_download', help='do not download the video', default=False)
3548         verbosity.add_option('-g', '--get-url',
3549                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3550         verbosity.add_option('-e', '--get-title',
3551                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3552         verbosity.add_option('--get-thumbnail',
3553                         action='store_true', dest='getthumbnail',
3554                         help='simulate, quiet but print thumbnail URL', default=False)
3555         verbosity.add_option('--get-description',
3556                         action='store_true', dest='getdescription',
3557                         help='simulate, quiet but print video description', default=False)
3558         verbosity.add_option('--get-filename',
3559                         action='store_true', dest='getfilename',
3560                         help='simulate, quiet but print output filename', default=False)
3561         verbosity.add_option('--get-format',
3562                         action='store_true', dest='getformat',
3563                         help='simulate, quiet but print output format', default=False)
3564         verbosity.add_option('--no-progress',
3565                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3566         verbosity.add_option('--console-title',
3567                         action='store_true', dest='consoletitle',
3568                         help='display progress in console titlebar', default=False)
3569
3570
3571         filesystem.add_option('-t', '--title',
3572                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3573         filesystem.add_option('-l', '--literal',
3574                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3575         filesystem.add_option('-A', '--auto-number',
3576                         action='store_true', dest='autonumber',
3577                         help='number downloaded files starting from 00000', default=False)
3578         filesystem.add_option('-o', '--output',
3579                         dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3580         filesystem.add_option('-a', '--batch-file',
3581                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3582         filesystem.add_option('-w', '--no-overwrites',
3583                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3584         filesystem.add_option('-c', '--continue',
3585                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3586         filesystem.add_option('--no-continue',
3587                         action='store_false', dest='continue_dl',
3588                         help='do not resume partially downloaded files (restart from beginning)')
3589         filesystem.add_option('--cookies',
3590                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
3591         filesystem.add_option('--no-part',
3592                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3593         filesystem.add_option('--no-mtime',
3594                         action='store_false', dest='updatetime',
3595                         help='do not use the Last-modified header to set the file modification time', default=True)
3596         filesystem.add_option('--write-description',
3597                         action='store_true', dest='writedescription',
3598                         help='write video description to a .description file', default=False)
3599         filesystem.add_option('--write-info-json',
3600                         action='store_true', dest='writeinfojson',
3601                         help='write video metadata to a .info.json file', default=False)
3602
3603
3604         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3605                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3606         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3607                         help='"best", "aac", "vorbis" or "mp3"; best by default')
3608         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3609                         help='ffmpeg audio bitrate specification, 128k by default')
3610         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3611                         help='keeps the video file on disk after the post-processing; the video is erased by default')
3612
3613
3614         parser.add_option_group(general)
3615         parser.add_option_group(selection)
3616         parser.add_option_group(filesystem)
3617         parser.add_option_group(verbosity)
3618         parser.add_option_group(video_format)
3619         parser.add_option_group(authentication)
3620         parser.add_option_group(postproc)
3621
3622         opts, args = parser.parse_args()
3623
3624         return parser, opts, args
3625
def gen_extractors():
	"""Build and return one instance of every supported InfoExtractor.

	Order matters: URLs are offered to each extractor in turn and the
	first one whose suitable() check matches handles the download, so
	the catch-all GenericIE must come last.
	"""
	# These three are shared: several wrapper extractors delegate the
	# actual downloading to the same underlying instance.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()
	return [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),

		# Fallback for URLs no specific extractor claims.
		GenericIE(),
	]
3655
3656 def main():
3657         parser, opts, args = parseOpts()
3658
3659         # Open appropriate CookieJar
3660         if opts.cookiefile is None:
3661                 jar = cookielib.CookieJar()
3662         else:
3663                 try:
3664                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3665                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3666                                 jar.load()
3667                 except (IOError, OSError), err:
3668                         sys.exit(u'ERROR: unable to open cookie file')
3669
3670         # Dump user agent
3671         if opts.dump_user_agent:
3672                 print std_headers['User-Agent']
3673                 sys.exit(0)
3674
3675         # Batch file verification
3676         batchurls = []
3677         if opts.batchfile is not None:
3678                 try:
3679                         if opts.batchfile == '-':
3680                                 batchfd = sys.stdin
3681                         else:
3682                                 batchfd = open(opts.batchfile, 'r')
3683                         batchurls = batchfd.readlines()
3684                         batchurls = [x.strip() for x in batchurls]
3685                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3686                 except IOError:
3687                         sys.exit(u'ERROR: batch file could not be read')
3688         all_urls = batchurls + args
3689
3690         # General configuration
3691         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3692         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3693         urllib2.install_opener(opener)
3694         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3695
3696         extractors = gen_extractors()
3697
3698         if opts.list_extractors:
3699                 for ie in extractors:
3700                         print(ie.IE_NAME)
3701                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3702                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3703                         for mu in matchedUrls:
3704                                 print(u'  ' + mu)
3705                 sys.exit(0)
3706
3707         # Conflicting, missing and erroneous options
3708         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3709                 parser.error(u'using .netrc conflicts with giving username/password')
3710         if opts.password is not None and opts.username is None:
3711                 parser.error(u'account username missing')
3712         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3713                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3714         if opts.usetitle and opts.useliteral:
3715                 parser.error(u'using title conflicts with using literal title')
3716         if opts.username is not None and opts.password is None:
3717                 opts.password = getpass.getpass(u'Type account password and press return:')
3718         if opts.ratelimit is not None:
3719                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3720                 if numeric_limit is None:
3721                         parser.error(u'invalid rate limit specified')
3722                 opts.ratelimit = numeric_limit
3723         if opts.retries is not None:
3724                 try:
3725                         opts.retries = long(opts.retries)
3726                 except (TypeError, ValueError), err:
3727                         parser.error(u'invalid retry count specified')
3728         try:
3729                 opts.playliststart = int(opts.playliststart)
3730                 if opts.playliststart <= 0:
3731                         raise ValueError(u'Playlist start must be positive')
3732         except (TypeError, ValueError), err:
3733                 parser.error(u'invalid playlist start number specified')
3734         try:
3735                 opts.playlistend = int(opts.playlistend)
3736                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3737                         raise ValueError(u'Playlist end must be greater than playlist start')
3738         except (TypeError, ValueError), err:
3739                 parser.error(u'invalid playlist end number specified')
3740         if opts.extractaudio:
3741                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
3742                         parser.error(u'invalid audio format specified')
3743
3744         # File downloader
3745         fd = FileDownloader({
3746                 'usenetrc': opts.usenetrc,
3747                 'username': opts.username,
3748                 'password': opts.password,
3749                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3750                 'forceurl': opts.geturl,
3751                 'forcetitle': opts.gettitle,
3752                 'forcethumbnail': opts.getthumbnail,
3753                 'forcedescription': opts.getdescription,
3754                 'forcefilename': opts.getfilename,
3755                 'forceformat': opts.getformat,
3756                 'simulate': opts.simulate,
3757                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3758                 'format': opts.format,
3759                 'format_limit': opts.format_limit,
3760                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3761                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3762                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3763                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3764                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3765                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3766                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3767                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3768                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3769                         or u'%(id)s.%(ext)s'),
3770                 'ignoreerrors': opts.ignoreerrors,
3771                 'ratelimit': opts.ratelimit,
3772                 'nooverwrites': opts.nooverwrites,
3773                 'retries': opts.retries,
3774                 'continuedl': opts.continue_dl,
3775                 'noprogress': opts.noprogress,
3776                 'playliststart': opts.playliststart,
3777                 'playlistend': opts.playlistend,
3778                 'logtostderr': opts.outtmpl == '-',
3779                 'consoletitle': opts.consoletitle,
3780                 'nopart': opts.nopart,
3781                 'updatetime': opts.updatetime,
3782                 'writedescription': opts.writedescription,
3783                 'writeinfojson': opts.writeinfojson,
3784                 'matchtitle': opts.matchtitle,
3785                 'rejecttitle': opts.rejecttitle,
3786                 })
3787         for extractor in extractors:
3788                 fd.add_info_extractor(extractor)
3789
3790         # PostProcessors
3791         if opts.extractaudio:
3792                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3793
3794         # Update version
3795         if opts.update_self:
3796                 updateSelf(fd, sys.argv[0])
3797
3798         # Maybe do nothing
3799         if len(all_urls) < 1:
3800                 if not opts.update_self:
3801                         parser.error(u'you must provide at least one URL')
3802                 else:
3803                         sys.exit()
3804         retcode = fd.download(all_urls)
3805
3806         # Dump cookie jar if requested
3807         if opts.cookiefile is not None:
3808                 try:
3809                         jar.save()
3810                 except (IOError, OSError), err:
3811                         sys.exit(u'ERROR: unable to save cookie jar')
3812
3813         sys.exit(retcode)
3814
3815
# Script entry point: translate the known fatal exceptions into clean exit
# codes/messages instead of letting a traceback reach the user.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# NOTE(review): exits silently with status 1 — presumably the
		# downloader has already printed the error; confirm in FileDownloader.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3825
3826 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: