685bd2b8c5875ef835a27a2f262dac5d1c7c73ba
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.16'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# HTTP headers sent with every request, mimicking a desktop Firefox browser
# so that sites serve the same pages they would to a regular user.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simple" titles: ASCII letters and digits
# (decoded so the result is a unicode string under Python 2).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# Use the stdlib json module when available (Python >= 2.6); otherwise fall
# back to a bundled copy of trivialjson, a minimal pure-Python JSON decoder.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                """Minimal stand-in for the stdlib json module (loads() only)."""
                @staticmethod
                def loads(s):
                        # Decode the input to unicode up front; all index
                        # arithmetic below operates on the unicode string.
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Report a parse failure with context around position i.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past JSON whitespace; optionally demand
                                # that some input remains afterwards.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape sequence into
                                # its character (simple escapes, \uXXXX, and
                                # UTF-16 surrogate pairs).
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair written as two \uXXXX escapes.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote; returns (next_index, text).
                                i += 1
                                e = i
                                while True:
                                        # Find the closing quote, skipping any quote
                                        # preceded by an odd number of backslashes.
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at '{'; returns (next_index, dict).
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at '['; returns (next_index, list).
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Parse the literals true / false / null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                # Parse an int or float following the JSON grammar.
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; anything
                        # unrecognised is tried as a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original implementation wrapped this logic in a one-shot
        # generator for no benefit; a plain try/except is equivalent.
        try:
                pref = locale.getpreferredencoding()
                # Verify that Python actually knows the reported codec.
                u'TEST'.encode(pref)
        except Exception:
                # Broken or unknown locale: fall back to a sane default.
                pref = 'UTF-8'
        return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
245 def sanitize_open(filename, open_mode):
246         """Try to open the given filename, and slightly tweak it if this fails.
247
248         Attempts to open the given filename. If this fails, it tries to change
249         the filename slightly, step by step, until it's either able to open it
250         or it fails and raises a final exception, like the standard open()
251         function.
252
253         It returns the tuple (stream, definitive_file_name).
254         """
255         try:
256                 if filename == u'-':
257                         if sys.platform == 'win32':
258                                 import msvcrt
259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260                         return (sys.stdout, filename)
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263         except (IOError, OSError), err:
264                 # In case of error, try to remove win32 forbidden chars
265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267                 # An exception here should be caught in the caller
268                 stream = open(filename, open_mode)
269                 return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        # parsedate_tz returns None for anything it cannot parse.
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                return None
        return email.utils.mktime_tz(timetuple)
279
280
class DownloadError(Exception):
        """Raised when downloading fails.

        FileDownloader objects throw this exception when they are not
        configured to continue on errors; it carries the appropriate
        error message.
        """
        pass
289
290
class SameFileError(Exception):
        """Raised when two downloads would collide on disk.

        FileDownloader objects throw this exception when they detect that
        multiple files would have to be written to the same file name.
        """
        pass
298
299
class PostProcessingError(Exception):
        """Raised when post-processing fails.

        A PostProcessor's .run() method may raise this exception to
        indicate an error in the postprocessing task.
        """
        pass
307
308
class UnavailableVideoError(Exception):
        """Raised when the requested format is unavailable.

        Thrown when a video is requested in a format that is not
        available for that video.
        """
        pass
316
317
class ContentTooShortError(Exception):
        """Raised when a download delivers fewer bytes than announced.

        FileDownloader objects may raise this when a downloaded file is
        smaller than the size the server announced first, indicating the
        connection was probably interrupted.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try a raw deflate stream first (negative wbits suppresses the
                # zlib header); fall back to the zlib-wrapped form on error.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # On Pythons whose addinfourl lacks getcode(), the constructor
                # does not take a status code either; set .code manually there.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the standard headers, replacing any same-named header
                # already present on the request.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Internal marker header: strip it and disable compression.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body, preserving the original
                # response's headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        forcefilename:    Force printing final filename.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        matchtitle:       Download only matching titles.
        rejecttitle:      Reject downloads for matching titles.
        logtostderr:      Log messages to stderr instead of stdout.
        consoletitle:     Display progress in console window's titlebar.
        nopart:           Do not use temporary .part files.
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        """

        # Class-level defaults; every one of these is reassigned per instance
        # in __init__.
        params = None            # options dictionary (see class docstring)
        _ies = []                # registered InfoExtractor objects
        _pps = []                # registered PostProcessor objects
        _download_retcode = None # exit code reported for the whole run
        _num_downloads = None    # ordinal of the current download (%(autonumber)s)
        _screen_file = None      # progress output stream (stdout or stderr)
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # "Mutual registration": the extractor learns its downloader.
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to the screen file (stdout, or stderr when the
                'logtostderr' option is set) if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # The trailing comma on print suppresses the
                                # automatic newline; the terminator supplies it
                                # unless skip_eol is requested.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr (regardless of the 'quiet' option)."""
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-style title escape sequence: OSC 0 ; title BEL
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
        def fixed_template(self):
                """Checks if the output template is fixed.

                A template is "fixed" when it contains no %(...)s placeholder,
                meaning every download would expand to the same file name.
                """
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
609         def try_rename(self, old_filename, new_filename):
610                 try:
611                         if old_filename == new_filename:
612                                 return
613                         os.rename(old_filename, new_filename)
614                 except (IOError, OSError), err:
615                         self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
        def report_writedescription(self, descfn):
                """ Report that the description file is being written """
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

        def report_writeinfojson(self, infofn):
                """ Report that the metadata file has been written """
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # \r rewinds to the start of the line so successive updates
                # overwrite each other instead of scrolling.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # The file name cannot be represented in the console's
                        # encoding; fall back to a message without it.
                        self.to_screen(u'[download] The file has already been downloaded')

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_screen(u'[download] Download completed')
                else:
                        # The progress line was printed without a newline;
                        # emit one to finish it.
                        self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles, in order: the --force-* printing options, simulate
		mode, title match/reject filtering, overwrite protection,
		writing the .description and .info.json side files, the actual
		download, and finally the postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() already reported the error in this case.
		if filename is None:
			return

		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory tree if needed.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# The json module is optional on older interpreters; probe for
			# a usable encoder before opening the output file.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				# _do_download returns (success, extra_info) — see its definition.
				success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
				info_dict.update(add_data)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
795
796         def download(self, url_list):
797                 """Download a given list of URLs."""
798                 if len(url_list) > 1 and self.fixed_template():
799                         raise SameFileError(self.params['outtmpl'])
800
801                 for url in url_list:
802                         suitable_found = False
803                         for ie in self._ies:
804                                 # Go to next InfoExtractor if not suitable
805                                 if not ie.suitable(url):
806                                         continue
807
808                                 # Suitable InfoExtractor found
809                                 suitable_found = True
810
811                                 # Extract information from URL and process it
812                                 ie.extract(url)
813
814                                 # Suitable InfoExtractor had been found; go to next URL
815                                 break
816
817                         if not suitable_found:
818                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820                 return self._download_retcode
821
822         def post_process(self, filename, ie_info):
823                 """Run the postprocessing chain on the given file."""
824                 info = dict(ie_info)
825                 info['filepath'] = filename
826                 for pp in self._pps:
827                         info = pp.run(info)
828                         if info is None:
829                                 break
830
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to rtmpdump.

		Returns True on success, False on failure (rtmpdump missing or
		exiting with a non-zero code that is not a resumable state).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# NOTE: [[], extra_args][condition] selects extra_args only when
		# the condition is truthy (a Python 2-era conditional idiom).
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress and exit code 1: give up on resuming.
			if prevsize == cursize and retval == 1:
				break
			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
867
868         def _do_download(self, filename, url, player_url):
869                 # Check file already present
870                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
871                         self.report_file_already_downloaded(filename)
872                         return True
873
874                 # Attempt to download using rtmpdump
875                 if url.startswith('rtmp'):
876                         return self._download_with_rtmpdump(filename, url, player_url)
877
878                 tmpfilename = self.temp_name(filename)
879                 stream = None
880                 open_mode = 'wb'
881
882                 # Do not include the Accept-Encoding header
883                 headers = {'Youtubedl-no-compression': 'True'}
884                 basic_request = urllib2.Request(url, None, headers)
885                 request = urllib2.Request(url, None, headers)
886
887                 # Establish possible resume length
888                 if os.path.isfile(tmpfilename):
889                         resume_len = os.path.getsize(tmpfilename)
890                 else:
891                         resume_len = 0
892
893                 # Request parameters in case of being able to resume
894                 if self.params.get('continuedl', False) and resume_len != 0:
895                         self.report_resuming_byte(resume_len)
896                         request.add_header('Range', 'bytes=%d-' % resume_len)
897                         open_mode = 'ab'
898
899                 count = 0
900                 retries = self.params.get('retries', 0)
901                 while count <= retries:
902                         # Establish connection
903                         try:
904                                 data = urllib2.urlopen(request)
905                                 break
906                         except (urllib2.HTTPError, ), err:
907                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
908                                         # Unexpected HTTP error
909                                         raise
910                                 elif err.code == 416:
911                                         # Unable to resume (requested range not satisfiable)
912                                         try:
913                                                 # Open the connection again without the range header
914                                                 data = urllib2.urlopen(basic_request)
915                                                 content_length = data.info()['Content-Length']
916                                         except (urllib2.HTTPError, ), err:
917                                                 if err.code < 500 or err.code >= 600:
918                                                         raise
919                                         else:
920                                                 # Examine the reported length
921                                                 if (content_length is not None and
922                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
923                                                         # The file had already been fully downloaded.
924                                                         # Explanation to the above condition: in issue #175 it was revealed that
925                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
926                                                         # changing the file size slightly and causing problems for some users. So
927                                                         # I decided to implement a suggested change and consider the file
928                                                         # completely downloaded if the file size differs less than 100 bytes from
929                                                         # the one in the hard drive.
930                                                         self.report_file_already_downloaded(filename)
931                                                         self.try_rename(tmpfilename, filename)
932                                                         return True
933                                                 else:
934                                                         # The length does not match, we start the download over
935                                                         self.report_unable_to_resume()
936                                                         open_mode = 'wb'
937                                                         break
938                         # Retry
939                         count += 1
940                         if count <= retries:
941                                 self.report_retry(count, retries)
942
943                 if count > retries:
944                         self.trouble(u'ERROR: giving up after %s retries' % retries)
945                         return False
946
947                 data_len = data.info().get('Content-length', None)
948                 if data_len is not None:
949                         data_len = long(data_len) + resume_len
950                 data_len_str = self.format_bytes(data_len)
951                 byte_counter = 0 + resume_len
952                 block_size = 1024
953                 start = time.time()
954                 while True:
955                         # Download and write
956                         before = time.time()
957                         data_block = data.read(block_size)
958                         after = time.time()
959                         if len(data_block) == 0:
960                                 break
961                         byte_counter += len(data_block)
962
963                         # Open file just in time
964                         if stream is None:
965                                 try:
966                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
967                                         assert stream is not None
968                                         filename = self.undo_temp_name(tmpfilename)
969                                         self.report_destination(filename)
970                                 except (OSError, IOError), err:
971                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
972                                         return False
973                         try:
974                                 stream.write(data_block)
975                         except (IOError, OSError), err:
976                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
977                                 return False
978                         block_size = self.best_block_size(after - before, len(data_block))
979
980                         # Progress message
981                         percent_str = self.calc_percent(byte_counter, data_len)
982                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
983                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
984                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
985
986                         # Apply rate limit
987                         self.slow_down(start, byte_counter - resume_len)
988
989                 if stream is None:
990                         self.trouble(u'\nERROR: Did not get any data blocks')
991                         return False
992                 stream.close()
993                 self.report_finish()
994                 if data_len is not None and byte_counter != data_len:
995                         raise ContentTooShortError(byte_counter, long(data_len))
996                 self.try_rename(tmpfilename, filename)
997
998                 # Update file modification time
999                 filetime = None
1000                 if self.params.get('updatetime', True):
1001                         filetime = self.try_utime(filename, data.info().get('last-modified', None))
1002
1003                 return True, {'filetime': filetime}
1004
1005
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces dictionaries
	describing the video(s) it refers to: the real video URL, title and
	simplified title, uploader, and so on.  Those dictionaries are
	handed to the FileDownloader, which may download the video file,
	print metadata, or perform other actions.  Each dictionary must
	include the fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, consumed only by the forced-printing options (they
	let youtube-dl serve as the backend of a video search tool such as
	youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; they should normally
	also be added to the list of extractors.
	"""

	# Class-level defaults; instances overwrite them as needed.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc) lazily."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if necessary, then extract and return the URL's info."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1074
1075
1076 class YoutubeIE(InfoExtractor):
1077         """Information extractor for youtube.com."""
1078
1079         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1080         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1081         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1082         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1083         _NETRC_MACHINE = 'youtube'
1084         # Listed in order of quality
1085         _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1086         _video_extensions = {
1087                 '13': '3gp',
1088                 '17': 'mp4',
1089                 '18': 'mp4',
1090                 '22': 'mp4',
1091                 '37': 'mp4',
1092                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1093                 '43': 'webm',
1094                 '45': 'webm',
1095         }
1096         IE_NAME = u'youtube'
1097
1098         def report_lang(self):
1099                 """Report attempt to set language."""
1100                 self._downloader.to_screen(u'[youtube] Setting language')
1101
1102         def report_login(self):
1103                 """Report attempt to log in."""
1104                 self._downloader.to_screen(u'[youtube] Logging in')
1105
1106         def report_age_confirmation(self):
1107                 """Report attempt to confirm age."""
1108                 self._downloader.to_screen(u'[youtube] Confirming age')
1109
1110         def report_video_webpage_download(self, video_id):
1111                 """Report attempt to download video webpage."""
1112                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1113
1114         def report_video_info_webpage_download(self, video_id):
1115                 """Report attempt to download video info webpage."""
1116                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1117
1118         def report_information_extraction(self, video_id):
1119                 """Report attempt to extract video information."""
1120                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1121
1122         def report_unavailable_format(self, video_id, format):
1123                 """Report extracted video URL."""
1124                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1125
1126         def report_rtmp_download(self):
1127                 """Indicate the download will use the RTMP protocol."""
1128                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1129
	def _real_initialize(self):
		"""Set the site language, optionally log in, and confirm age.

		Called once per instance via InfoExtractor.initialize().
		Language/login failures are reported as warnings and abort
		silently; only a failed age confirmation is reported as an
		error.  Does nothing when no downloader is attached.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (the _LANG_URL request carries hl=en/gl=US parameters)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1198
1199         def _real_extract(self, url):
1200                 # Extract video id from URL
1201                 mobj = re.match(self._VALID_URL, url)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1204                         return
1205                 video_id = mobj.group(2)
1206
1207                 # Get video webpage
1208                 self.report_video_webpage_download(video_id)
1209                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1210                 try:
1211                         video_webpage = urllib2.urlopen(request).read()
1212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1214                         return
1215
1216                 # Attempt to extract SWF player URL
1217                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1218                 if mobj is not None:
1219                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1220                 else:
1221                         player_url = None
1222
1223                 # Get video info
1224                 self.report_video_info_webpage_download(video_id)
1225                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1226                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1227                                         % (video_id, el_type))
1228                         request = urllib2.Request(video_info_url)
1229                         try:
1230                                 video_info_webpage = urllib2.urlopen(request).read()
1231                                 video_info = parse_qs(video_info_webpage)
1232                                 if 'token' in video_info:
1233                                         break
1234                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1235                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1236                                 return
1237                 if 'token' not in video_info:
1238                         if 'reason' in video_info:
1239                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1240                         else:
1241                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1242                         return
1243
1244                 # Start extracting information
1245                 self.report_information_extraction(video_id)
1246
1247                 # uploader
1248                 if 'author' not in video_info:
1249                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1250                         return
1251                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1252
1253                 # title
1254                 if 'title' not in video_info:
1255                         self._downloader.trouble(u'ERROR: unable to extract video title')
1256                         return
1257                 video_title = urllib.unquote_plus(video_info['title'][0])
1258                 video_title = video_title.decode('utf-8')
1259                 video_title = sanitize_title(video_title)
1260
1261                 # simplified title
1262                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1263                 simple_title = simple_title.strip(ur'_')
1264
1265                 # thumbnail image
1266                 if 'thumbnail_url' not in video_info:
1267                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1268                         video_thumbnail = ''
1269                 else:   # don't panic if we can't find it
1270                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1271
1272                 # upload date
1273                 upload_date = u'NA'
1274                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1275                 if mobj is not None:
1276                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1277                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1278                         for expression in format_expressions:
1279                                 try:
1280                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1281                                 except:
1282                                         pass
1283
1284                 # description
1285                 try:
1286                         lxml.etree
1287                 except NameError:
1288                         video_description = u'No description available.'
1289                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1290                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1291                                 if mobj is not None:
1292                                         video_description = mobj.group(1).decode('utf-8')
1293                 else:
1294                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1295                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1296                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1297                         # TODO use another parser
1298
1299                 # token
1300                 video_token = urllib.unquote_plus(video_info['token'][0])
1301
1302                 # Decide which formats to download
1303                 req_format = self._downloader.params.get('format', None)
1304
1305                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1306                         self.report_rtmp_download()
1307                         video_url_list = [(None, video_info['conn'][0])]
1308                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1309                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1310                         url_data = [parse_qs(uds) for uds in url_data_strs]
1311                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1312                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1313
1314                         format_limit = self._downloader.params.get('format_limit', None)
1315                         if format_limit is not None and format_limit in self._available_formats:
1316                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1317                         else:
1318                                 format_list = self._available_formats
1319                         existing_formats = [x for x in format_list if x in url_map]
1320                         if len(existing_formats) == 0:
1321                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1322                                 return
1323                         if req_format is None or req_format == 'best':
1324                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1325                         elif req_format == 'worst':
1326                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1327                         elif req_format in ('-1', 'all'):
1328                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1329                         else:
1330                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1331                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1332                                 req_formats = req_format.split('/')
1333                                 video_url_list = None
1334                                 for rf in req_formats:
1335                                         if rf in url_map:
1336                                                 video_url_list = [(rf, url_map[rf])]
1337                                                 break
1338                                 if video_url_list is None:
1339                                         self._downloader.trouble(u'ERROR: requested format not available')
1340                                         return
1341                 else:
1342                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1343                         return
1344
1345                 for format_param, video_real_url in video_url_list:
1346                         # At this point we have a new video
1347                         self._downloader.increment_downloads()
1348
1349                         # Extension
1350                         video_extension = self._video_extensions.get(format_param, 'flv')
1351
1352                         try:
1353                                 # Process video information
1354                                 self._downloader.process_info({
1355                                         'id':           video_id.decode('utf-8'),
1356                                         'url':          video_real_url.decode('utf-8'),
1357                                         'uploader':     video_uploader.decode('utf-8'),
1358                                         'upload_date':  upload_date,
1359                                         'title':        video_title,
1360                                         'stitle':       simple_title,
1361                                         'ext':          video_extension.decode('utf-8'),
1362                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1363                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1364                                         'description':  video_description,
1365                                         'player_url':   player_url,
1366                                 })
1367                         except UnavailableVideoError, err:
1368                                 self._downloader.trouble(u'\nERROR: unable to download video')
1369
1370
1371 class MetacafeIE(InfoExtractor):
1372         """Information Extractor for metacafe.com."""
1373
1374         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1375         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1376         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1377         _youtube_ie = None
1378         IE_NAME = u'metacafe'
1379
1380         def __init__(self, youtube_ie, downloader=None):
1381                 InfoExtractor.__init__(self, downloader)
1382                 self._youtube_ie = youtube_ie
1383
1384         def report_disclaimer(self):
1385                 """Report disclaimer retrieval."""
1386                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1387
1388         def report_age_confirmation(self):
1389                 """Report attempt to confirm age."""
1390                 self._downloader.to_screen(u'[metacafe] Confirming age')
1391
1392         def report_download_webpage(self, video_id):
1393                 """Report webpage download."""
1394                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1395
1396         def report_extraction(self, video_id):
1397                 """Report information extraction."""
1398                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1399
1400         def _real_initialize(self):
1401                 # Retrieve disclaimer
1402                 request = urllib2.Request(self._DISCLAIMER)
1403                 try:
1404                         self.report_disclaimer()
1405                         disclaimer = urllib2.urlopen(request).read()
1406                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1407                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1408                         return
1409
1410                 # Confirm age
1411                 disclaimer_form = {
1412                         'filters': '0',
1413                         'submit': "Continue - I'm over 18",
1414                         }
1415                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1416                 try:
1417                         self.report_age_confirmation()
1418                         disclaimer = urllib2.urlopen(request).read()
1419                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1420                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1421                         return
1422
1423         def _real_extract(self, url):
1424                 # Extract id and simplified title from URL
1425                 mobj = re.match(self._VALID_URL, url)
1426                 if mobj is None:
1427                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1428                         return
1429
1430                 video_id = mobj.group(1)
1431
1432                 # Check if video comes from YouTube
1433                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1434                 if mobj2 is not None:
1435                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1436                         return
1437
1438                 # At this point we have a new video
1439                 self._downloader.increment_downloads()
1440
1441                 simple_title = mobj.group(2).decode('utf-8')
1442
1443                 # Retrieve video webpage to extract further information
1444                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1445                 try:
1446                         self.report_download_webpage(video_id)
1447                         webpage = urllib2.urlopen(request).read()
1448                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1450                         return
1451
1452                 # Extract URL, uploader and title from webpage
1453                 self.report_extraction(video_id)
1454                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1455                 if mobj is not None:
1456                         mediaURL = urllib.unquote(mobj.group(1))
1457                         video_extension = mediaURL[-3:]
1458
1459                         # Extract gdaKey if available
1460                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1461                         if mobj is None:
1462                                 video_url = mediaURL
1463                         else:
1464                                 gdaKey = mobj.group(1)
1465                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1466                 else:
1467                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1468                         if mobj is None:
1469                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1470                                 return
1471                         vardict = parse_qs(mobj.group(1))
1472                         if 'mediaData' not in vardict:
1473                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1474                                 return
1475                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1476                         if mobj is None:
1477                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1478                                 return
1479                         mediaURL = mobj.group(1).replace('\\/', '/')
1480                         video_extension = mediaURL[-3:]
1481                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1482
1483                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1484                 if mobj is None:
1485                         self._downloader.trouble(u'ERROR: unable to extract title')
1486                         return
1487                 video_title = mobj.group(1).decode('utf-8')
1488                 video_title = sanitize_title(video_title)
1489
1490                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1491                 if mobj is None:
1492                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1493                         return
1494                 video_uploader = mobj.group(1)
1495
1496                 try:
1497                         # Process video information
1498                         self._downloader.process_info({
1499                                 'id':           video_id.decode('utf-8'),
1500                                 'url':          video_url.decode('utf-8'),
1501                                 'uploader':     video_uploader.decode('utf-8'),
1502                                 'upload_date':  u'NA',
1503                                 'title':        video_title,
1504                                 'stitle':       simple_title,
1505                                 'ext':          video_extension.decode('utf-8'),
1506                                 'format':       u'NA',
1507                                 'player_url':   None,
1508                         })
1509                 except UnavailableVideoError:
1510                         self._downloader.trouble(u'\nERROR: unable to download video')
1511
1512
1513 class DailymotionIE(InfoExtractor):
1514         """Information Extractor for Dailymotion"""
1515
1516         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1517         IE_NAME = u'dailymotion'
1518
1519         def __init__(self, downloader=None):
1520                 InfoExtractor.__init__(self, downloader)
1521
1522         def report_download_webpage(self, video_id):
1523                 """Report webpage download."""
1524                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1525
1526         def report_extraction(self, video_id):
1527                 """Report information extraction."""
1528                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1529
1530         def _real_initialize(self):
1531                 return
1532
1533         def _real_extract(self, url):
1534                 # Extract id and simplified title from URL
1535                 mobj = re.match(self._VALID_URL, url)
1536                 if mobj is None:
1537                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1538                         return
1539
1540                 # At this point we have a new video
1541                 self._downloader.increment_downloads()
1542                 video_id = mobj.group(1)
1543
1544                 simple_title = mobj.group(2).decode('utf-8')
1545                 video_extension = 'flv'
1546
1547                 # Retrieve video webpage to extract further information
1548                 request = urllib2.Request(url)
1549                 request.add_header('Cookie', 'family_filter=off')
1550                 try:
1551                         self.report_download_webpage(video_id)
1552                         webpage = urllib2.urlopen(request).read()
1553                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1554                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1555                         return
1556
1557                 # Extract URL, uploader and title from webpage
1558                 self.report_extraction(video_id)
1559                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1560                 if mobj is None:
1561                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1562                         return
1563                 sequence = urllib.unquote(mobj.group(1))
1564                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1565                 if mobj is None:
1566                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1567                         return
1568                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1569
1570                 # if needed add http://www.dailymotion.com/ if relative URL
1571
1572                 video_url = mediaURL
1573
1574                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1575                 if mobj is None:
1576                         self._downloader.trouble(u'ERROR: unable to extract title')
1577                         return
1578                 video_title = mobj.group(1).decode('utf-8')
1579                 video_title = sanitize_title(video_title)
1580
1581                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1582                 if mobj is None:
1583                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1584                         return
1585                 video_uploader = mobj.group(1)
1586
1587                 try:
1588                         # Process video information
1589                         self._downloader.process_info({
1590                                 'id':           video_id.decode('utf-8'),
1591                                 'url':          video_url.decode('utf-8'),
1592                                 'uploader':     video_uploader.decode('utf-8'),
1593                                 'upload_date':  u'NA',
1594                                 'title':        video_title,
1595                                 'stitle':       simple_title,
1596                                 'ext':          video_extension.decode('utf-8'),
1597                                 'format':       u'NA',
1598                                 'player_url':   None,
1599                         })
1600                 except UnavailableVideoError:
1601                         self._downloader.trouble(u'\nERROR: unable to download video')
1602
1603
1604 class GoogleIE(InfoExtractor):
1605         """Information extractor for video.google.com."""
1606
1607         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1608         IE_NAME = u'video.google'
1609
1610         def __init__(self, downloader=None):
1611                 InfoExtractor.__init__(self, downloader)
1612
1613         def report_download_webpage(self, video_id):
1614                 """Report webpage download."""
1615                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1616
1617         def report_extraction(self, video_id):
1618                 """Report information extraction."""
1619                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1620
1621         def _real_initialize(self):
1622                 return
1623
1624         def _real_extract(self, url):
1625                 # Extract id from URL
1626                 mobj = re.match(self._VALID_URL, url)
1627                 if mobj is None:
1628                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1629                         return
1630
1631                 # At this point we have a new video
1632                 self._downloader.increment_downloads()
1633                 video_id = mobj.group(1)
1634
1635                 video_extension = 'mp4'
1636
1637                 # Retrieve video webpage to extract further information
1638                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1639                 try:
1640                         self.report_download_webpage(video_id)
1641                         webpage = urllib2.urlopen(request).read()
1642                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1643                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1644                         return
1645
1646                 # Extract URL, uploader, and title from webpage
1647                 self.report_extraction(video_id)
1648                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1649                 if mobj is None:
1650                         video_extension = 'flv'
1651                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1652                 if mobj is None:
1653                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1654                         return
1655                 mediaURL = urllib.unquote(mobj.group(1))
1656                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1657                 mediaURL = mediaURL.replace('\\x26', '\x26')
1658
1659                 video_url = mediaURL
1660
1661                 mobj = re.search(r'<title>(.*)</title>', webpage)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: unable to extract title')
1664                         return
1665                 video_title = mobj.group(1).decode('utf-8')
1666                 video_title = sanitize_title(video_title)
1667                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668
1669                 # Extract video description
1670                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1671                 if mobj is None:
1672                         self._downloader.trouble(u'ERROR: unable to extract video description')
1673                         return
1674                 video_description = mobj.group(1).decode('utf-8')
1675                 if not video_description:
1676                         video_description = 'No description available.'
1677
1678                 # Extract video thumbnail
1679                 if self._downloader.params.get('forcethumbnail', False):
1680                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1681                         try:
1682                                 webpage = urllib2.urlopen(request).read()
1683                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1684                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1685                                 return
1686                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1687                         if mobj is None:
1688                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1689                                 return
1690                         video_thumbnail = mobj.group(1)
1691                 else:   # we need something to pass to process_info
1692                         video_thumbnail = ''
1693
1694                 try:
1695                         # Process video information
1696                         self._downloader.process_info({
1697                                 'id':           video_id.decode('utf-8'),
1698                                 'url':          video_url.decode('utf-8'),
1699                                 'uploader':     u'NA',
1700                                 'upload_date':  u'NA',
1701                                 'title':        video_title,
1702                                 'stitle':       simple_title,
1703                                 'ext':          video_extension.decode('utf-8'),
1704                                 'format':       u'NA',
1705                                 'player_url':   None,
1706                         })
1707                 except UnavailableVideoError:
1708                         self._downloader.trouble(u'\nERROR: unable to download video')
1709
1710
1711 class PhotobucketIE(InfoExtractor):
1712         """Information extractor for photobucket.com."""
1713
1714         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1715         IE_NAME = u'photobucket'
1716
1717         def __init__(self, downloader=None):
1718                 InfoExtractor.__init__(self, downloader)
1719
1720         def report_download_webpage(self, video_id):
1721                 """Report webpage download."""
1722                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1723
1724         def report_extraction(self, video_id):
1725                 """Report information extraction."""
1726                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1727
1728         def _real_initialize(self):
1729                 return
1730
1731         def _real_extract(self, url):
1732                 # Extract id from URL
1733                 mobj = re.match(self._VALID_URL, url)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1736                         return
1737
1738                 # At this point we have a new video
1739                 self._downloader.increment_downloads()
1740                 video_id = mobj.group(1)
1741
1742                 video_extension = 'flv'
1743
1744                 # Retrieve video webpage to extract further information
1745                 request = urllib2.Request(url)
1746                 try:
1747                         self.report_download_webpage(video_id)
1748                         webpage = urllib2.urlopen(request).read()
1749                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1750                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1751                         return
1752
1753                 # Extract URL, uploader, and title from webpage
1754                 self.report_extraction(video_id)
1755                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1756                 if mobj is None:
1757                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1758                         return
1759                 mediaURL = urllib.unquote(mobj.group(1))
1760
1761                 video_url = mediaURL
1762
1763                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1764                 if mobj is None:
1765                         self._downloader.trouble(u'ERROR: unable to extract title')
1766                         return
1767                 video_title = mobj.group(1).decode('utf-8')
1768                 video_title = sanitize_title(video_title)
1769                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1770
1771                 video_uploader = mobj.group(2).decode('utf-8')
1772
1773                 try:
1774                         # Process video information
1775                         self._downloader.process_info({
1776                                 'id':           video_id.decode('utf-8'),
1777                                 'url':          video_url.decode('utf-8'),
1778                                 'uploader':     video_uploader,
1779                                 'upload_date':  u'NA',
1780                                 'title':        video_title,
1781                                 'stitle':       simple_title,
1782                                 'ext':          video_extension.decode('utf-8'),
1783                                 'format':       u'NA',
1784                                 'player_url':   None,
1785                         })
1786                 except UnavailableVideoError:
1787                         self._downloader.trouble(u'\nERROR: unable to download video')
1788
1789
1790 class YahooIE(InfoExtractor):
1791         """Information extractor for video.yahoo.com."""
1792
1793         # _VALID_URL matches all Yahoo! Video URLs
1794         # _VPAGE_URL matches only the extractable '/watch/' URLs
1795         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1796         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1797         IE_NAME = u'video.yahoo'
1798
	def __init__(self, downloader=None):
		# Delegate common extractor setup (downloader registration) to the base class.
		InfoExtractor.__init__(self, downloader)
1801
1802         def report_download_webpage(self, video_id):
1803                 """Report webpage download."""
1804                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1805
1806         def report_extraction(self, video_id):
1807                 """Report information extraction."""
1808                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1809
1810         def _real_initialize(self):
1811                 return
1812
1813         def _real_extract(self, url, new_video=True):
1814                 # Extract ID from URL
1815                 mobj = re.match(self._VALID_URL, url)
1816                 if mobj is None:
1817                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1818                         return
1819
1820                 # At this point we have a new video
1821                 self._downloader.increment_downloads()
1822                 video_id = mobj.group(2)
1823                 video_extension = 'flv'
1824
1825                 # Rewrite valid but non-extractable URLs as
1826                 # extractable English language /watch/ URLs
1827                 if re.match(self._VPAGE_URL, url) is None:
1828                         request = urllib2.Request(url)
1829                         try:
1830                                 webpage = urllib2.urlopen(request).read()
1831                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1832                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1833                                 return
1834
1835                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1836                         if mobj is None:
1837                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1838                                 return
1839                         yahoo_id = mobj.group(1)
1840
1841                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1842                         if mobj is None:
1843                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1844                                 return
1845                         yahoo_vid = mobj.group(1)
1846
1847                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1848                         return self._real_extract(url, new_video=False)
1849
1850                 # Retrieve video webpage to extract further information
1851                 request = urllib2.Request(url)
1852                 try:
1853                         self.report_download_webpage(video_id)
1854                         webpage = urllib2.urlopen(request).read()
1855                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1857                         return
1858
1859                 # Extract uploader and title from webpage
1860                 self.report_extraction(video_id)
1861                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1862                 if mobj is None:
1863                         self._downloader.trouble(u'ERROR: unable to extract video title')
1864                         return
1865                 video_title = mobj.group(1).decode('utf-8')
1866                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1867
1868                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1869                 if mobj is None:
1870                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1871                         return
1872                 video_uploader = mobj.group(1).decode('utf-8')
1873
1874                 # Extract video thumbnail
1875                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1876                 if mobj is None:
1877                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1878                         return
1879                 video_thumbnail = mobj.group(1).decode('utf-8')
1880
1881                 # Extract video description
1882                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1883                 if mobj is None:
1884                         self._downloader.trouble(u'ERROR: unable to extract video description')
1885                         return
1886                 video_description = mobj.group(1).decode('utf-8')
1887                 if not video_description:
1888                         video_description = 'No description available.'
1889
1890                 # Extract video height and width
1891                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1892                 if mobj is None:
1893                         self._downloader.trouble(u'ERROR: unable to extract video height')
1894                         return
1895                 yv_video_height = mobj.group(1)
1896
1897                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1898                 if mobj is None:
1899                         self._downloader.trouble(u'ERROR: unable to extract video width')
1900                         return
1901                 yv_video_width = mobj.group(1)
1902
1903                 # Retrieve video playlist to extract media URL
1904                 # I'm not completely sure what all these options are, but we
1905                 # seem to need most of them, otherwise the server sends a 401.
1906                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1907                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1908                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1909                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1910                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1911                 try:
1912                         self.report_download_webpage(video_id)
1913                         webpage = urllib2.urlopen(request).read()
1914                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1916                         return
1917
1918                 # Extract media URL from playlist XML
1919                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1922                         return
1923                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1924                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1925
1926                 try:
1927                         # Process video information
1928                         self._downloader.process_info({
1929                                 'id':           video_id.decode('utf-8'),
1930                                 'url':          video_url,
1931                                 'uploader':     video_uploader,
1932                                 'upload_date':  u'NA',
1933                                 'title':        video_title,
1934                                 'stitle':       simple_title,
1935                                 'ext':          video_extension.decode('utf-8'),
1936                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1937                                 'description':  video_description,
1938                                 'thumbnail':    video_thumbnail,
1939                                 'player_url':   None,
1940                         })
1941                 except UnavailableVideoError:
1942                         self._downloader.trouble(u'\nERROR: unable to download video')
1943
1944
1945 class VimeoIE(InfoExtractor):
1946         """Information extractor for vimeo.com."""
1947
1948         # _VALID_URL matches Vimeo URLs
1949         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1950         IE_NAME = u'vimeo'
1951
1952         def __init__(self, downloader=None):
1953                 InfoExtractor.__init__(self, downloader)
1954
1955         def report_download_webpage(self, video_id):
1956                 """Report webpage download."""
1957                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1958
1959         def report_extraction(self, video_id):
1960                 """Report information extraction."""
1961                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1962
1963         def _real_initialize(self):
1964                 return
1965
1966         def _real_extract(self, url, new_video=True):
1967                 # Extract ID from URL
1968                 mobj = re.match(self._VALID_URL, url)
1969                 if mobj is None:
1970                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1971                         return
1972
1973                 # At this point we have a new video
1974                 self._downloader.increment_downloads()
1975                 video_id = mobj.group(1)
1976
1977                 # Retrieve video webpage to extract further information
1978                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1979                 try:
1980                         self.report_download_webpage(video_id)
1981                         webpage = urllib2.urlopen(request).read()
1982                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1983                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1984                         return
1985
1986                 # Now we begin extracting as much information as we can from what we
1987                 # retrieved. First we extract the information common to all extractors,
1988                 # and latter we extract those that are Vimeo specific.
1989                 self.report_extraction(video_id)
1990
1991                 # Extract title
1992                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: unable to extract video title')
1995                         return
1996                 video_title = mobj.group(1).decode('utf-8')
1997                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1998
1999                 # Extract uploader
2000                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2001                 if mobj is None:
2002                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2003                         return
2004                 video_uploader = mobj.group(1).decode('utf-8')
2005
2006                 # Extract video thumbnail
2007                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2008                 if mobj is None:
2009                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2010                         return
2011                 video_thumbnail = mobj.group(1).decode('utf-8')
2012
2013                 # # Extract video description
2014                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2015                 # if mobj is None:
2016                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2017                 #       return
2018                 # video_description = mobj.group(1).decode('utf-8')
2019                 # if not video_description: video_description = 'No description available.'
2020                 video_description = 'Foo.'
2021
2022                 # Vimeo specific: extract request signature
2023                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2024                 if mobj is None:
2025                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2026                         return
2027                 sig = mobj.group(1).decode('utf-8')
2028
2029                 # Vimeo specific: Extract request signature expiration
2030                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2031                 if mobj is None:
2032                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2033                         return
2034                 sig_exp = mobj.group(1).decode('utf-8')
2035
2036                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2037
2038                 try:
2039                         # Process video information
2040                         self._downloader.process_info({
2041                                 'id':           video_id.decode('utf-8'),
2042                                 'url':          video_url,
2043                                 'uploader':     video_uploader,
2044                                 'upload_date':  u'NA',
2045                                 'title':        video_title,
2046                                 'stitle':       simple_title,
2047                                 'ext':          u'mp4',
2048                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2049                                 'description':  video_description,
2050                                 'thumbnail':    video_thumbnail,
2051                                 'description':  video_description,
2052                                 'player_url':   None,
2053                         })
2054                 except UnavailableVideoError:
2055                         self._downloader.trouble(u'ERROR: unable to download video')
2056
2057
2058 class GenericIE(InfoExtractor):
2059         """Generic last-resort information extractor."""
2060
2061         _VALID_URL = r'.*'
2062         IE_NAME = u'generic'
2063
2064         def __init__(self, downloader=None):
2065                 InfoExtractor.__init__(self, downloader)
2066
2067         def report_download_webpage(self, video_id):
2068                 """Report webpage download."""
2069                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2070                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2071
2072         def report_extraction(self, video_id):
2073                 """Report information extraction."""
2074                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2075
2076         def _real_initialize(self):
2077                 return
2078
2079         def _real_extract(self, url):
2080                 # At this point we have a new video
2081                 self._downloader.increment_downloads()
2082
2083                 video_id = url.split('/')[-1]
2084                 request = urllib2.Request(url)
2085                 try:
2086                         self.report_download_webpage(video_id)
2087                         webpage = urllib2.urlopen(request).read()
2088                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2089                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2090                         return
2091                 except ValueError, err:
2092                         # since this is the last-resort InfoExtractor, if
2093                         # this error is thrown, it'll be thrown here
2094                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2095                         return
2096
2097                 self.report_extraction(video_id)
2098                 # Start with something easy: JW Player in SWFObject
2099                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2100                 if mobj is None:
2101                         # Broaden the search a little bit
2102                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2103                 if mobj is None:
2104                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2105                         return
2106
2107                 # It's possible that one of the regexes
2108                 # matched, but returned an empty group:
2109                 if mobj.group(1) is None:
2110                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2111                         return
2112
2113                 video_url = urllib.unquote(mobj.group(1))
2114                 video_id = os.path.basename(video_url)
2115
2116                 # here's a fun little line of code for you:
2117                 video_extension = os.path.splitext(video_id)[1][1:]
2118                 video_id = os.path.splitext(video_id)[0]
2119
2120                 # it's tempting to parse this further, but you would
2121                 # have to take into account all the variations like
2122                 #   Video Title - Site Name
2123                 #   Site Name | Video Title
2124                 #   Video Title - Tagline | Site Name
2125                 # and so on and so forth; it's just not practical
2126                 mobj = re.search(r'<title>(.*)</title>', webpage)
2127                 if mobj is None:
2128                         self._downloader.trouble(u'ERROR: unable to extract title')
2129                         return
2130                 video_title = mobj.group(1).decode('utf-8')
2131                 video_title = sanitize_title(video_title)
2132                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2133
2134                 # video uploader is domain name
2135                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2136                 if mobj is None:
2137                         self._downloader.trouble(u'ERROR: unable to extract title')
2138                         return
2139                 video_uploader = mobj.group(1).decode('utf-8')
2140
2141                 try:
2142                         # Process video information
2143                         self._downloader.process_info({
2144                                 'id':           video_id.decode('utf-8'),
2145                                 'url':          video_url.decode('utf-8'),
2146                                 'uploader':     video_uploader,
2147                                 'upload_date':  u'NA',
2148                                 'title':        video_title,
2149                                 'stitle':       simple_title,
2150                                 'ext':          video_extension.decode('utf-8'),
2151                                 'format':       u'NA',
2152                                 'player_url':   None,
2153                         })
2154                 except UnavailableVideoError, err:
2155                         self._downloader.trouble(u'\nERROR: unable to download video')
2156
2157
2158 class YoutubeSearchIE(InfoExtractor):
2159         """Information Extractor for YouTube search queries."""
2160         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2161         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2162         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2163         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2164         _youtube_ie = None
2165         _max_youtube_results = 1000
2166         IE_NAME = u'youtube:search'
2167
2168         def __init__(self, youtube_ie, downloader=None):
2169                 InfoExtractor.__init__(self, downloader)
2170                 self._youtube_ie = youtube_ie
2171
2172         def report_download_page(self, query, pagenum):
2173                 """Report attempt to download playlist page with given number."""
2174                 query = query.decode(preferredencoding())
2175                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2176
2177         def _real_initialize(self):
2178                 self._youtube_ie.initialize()
2179
2180         def _real_extract(self, query):
2181                 mobj = re.match(self._VALID_URL, query)
2182                 if mobj is None:
2183                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2184                         return
2185
2186                 prefix, query = query.split(':')
2187                 prefix = prefix[8:]
2188                 query = query.encode('utf-8')
2189                 if prefix == '':
2190                         self._download_n_results(query, 1)
2191                         return
2192                 elif prefix == 'all':
2193                         self._download_n_results(query, self._max_youtube_results)
2194                         return
2195                 else:
2196                         try:
2197                                 n = long(prefix)
2198                                 if n <= 0:
2199                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2200                                         return
2201                                 elif n > self._max_youtube_results:
2202                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2203                                         n = self._max_youtube_results
2204                                 self._download_n_results(query, n)
2205                                 return
2206                         except ValueError: # parsing prefix as integer fails
2207                                 self._download_n_results(query, 1)
2208                                 return
2209
2210         def _download_n_results(self, query, n):
2211                 """Downloads a specified number of results for a query"""
2212
2213                 video_ids = []
2214                 already_seen = set()
2215                 pagenum = 1
2216
2217                 while True:
2218                         self.report_download_page(query, pagenum)
2219                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2220                         request = urllib2.Request(result_url)
2221                         try:
2222                                 page = urllib2.urlopen(request).read()
2223                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2224                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2225                                 return
2226
2227                         # Extract video identifiers
2228                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2229                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2230                                 if video_id not in already_seen:
2231                                         video_ids.append(video_id)
2232                                         already_seen.add(video_id)
2233                                         if len(video_ids) == n:
2234                                                 # Specified n videos reached
2235                                                 for id in video_ids:
2236                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2237                                                 return
2238
2239                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2240                                 for id in video_ids:
2241                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2242                                 return
2243
2244                         pagenum = pagenum + 1
2245
2246
2247 class GoogleSearchIE(InfoExtractor):
2248         """Information Extractor for Google Video search queries."""
2249         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2250         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2251         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2252         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2253         _google_ie = None
2254         _max_google_results = 1000
2255         IE_NAME = u'video.google:search'
2256
2257         def __init__(self, google_ie, downloader=None):
2258                 InfoExtractor.__init__(self, downloader)
2259                 self._google_ie = google_ie
2260
2261         def report_download_page(self, query, pagenum):
2262                 """Report attempt to download playlist page with given number."""
2263                 query = query.decode(preferredencoding())
2264                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2265
2266         def _real_initialize(self):
2267                 self._google_ie.initialize()
2268
2269         def _real_extract(self, query):
2270                 mobj = re.match(self._VALID_URL, query)
2271                 if mobj is None:
2272                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2273                         return
2274
2275                 prefix, query = query.split(':')
2276                 prefix = prefix[8:]
2277                 query = query.encode('utf-8')
2278                 if prefix == '':
2279                         self._download_n_results(query, 1)
2280                         return
2281                 elif prefix == 'all':
2282                         self._download_n_results(query, self._max_google_results)
2283                         return
2284                 else:
2285                         try:
2286                                 n = long(prefix)
2287                                 if n <= 0:
2288                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2289                                         return
2290                                 elif n > self._max_google_results:
2291                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2292                                         n = self._max_google_results
2293                                 self._download_n_results(query, n)
2294                                 return
2295                         except ValueError: # parsing prefix as integer fails
2296                                 self._download_n_results(query, 1)
2297                                 return
2298
2299         def _download_n_results(self, query, n):
2300                 """Downloads a specified number of results for a query"""
2301
2302                 video_ids = []
2303                 already_seen = set()
2304                 pagenum = 1
2305
2306                 while True:
2307                         self.report_download_page(query, pagenum)
2308                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2309                         request = urllib2.Request(result_url)
2310                         try:
2311                                 page = urllib2.urlopen(request).read()
2312                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2314                                 return
2315
2316                         # Extract video identifiers
2317                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2318                                 video_id = mobj.group(1)
2319                                 if video_id not in already_seen:
2320                                         video_ids.append(video_id)
2321                                         already_seen.add(video_id)
2322                                         if len(video_ids) == n:
2323                                                 # Specified n videos reached
2324                                                 for id in video_ids:
2325                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2326                                                 return
2327
2328                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2329                                 for id in video_ids:
2330                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2331                                 return
2332
2333                         pagenum = pagenum + 1
2334
2335
2336 class YahooSearchIE(InfoExtractor):
2337         """Information Extractor for Yahoo! Video search queries."""
2338         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2339         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2340         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2341         _MORE_PAGES_INDICATOR = r'\s*Next'
2342         _yahoo_ie = None
2343         _max_yahoo_results = 1000
2344         IE_NAME = u'video.yahoo:search'
2345
2346         def __init__(self, yahoo_ie, downloader=None):
2347                 InfoExtractor.__init__(self, downloader)
2348                 self._yahoo_ie = yahoo_ie
2349
2350         def report_download_page(self, query, pagenum):
2351                 """Report attempt to download playlist page with given number."""
2352                 query = query.decode(preferredencoding())
2353                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2354
2355         def _real_initialize(self):
2356                 self._yahoo_ie.initialize()
2357
2358         def _real_extract(self, query):
2359                 mobj = re.match(self._VALID_URL, query)
2360                 if mobj is None:
2361                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2362                         return
2363
2364                 prefix, query = query.split(':')
2365                 prefix = prefix[8:]
2366                 query = query.encode('utf-8')
2367                 if prefix == '':
2368                         self._download_n_results(query, 1)
2369                         return
2370                 elif prefix == 'all':
2371                         self._download_n_results(query, self._max_yahoo_results)
2372                         return
2373                 else:
2374                         try:
2375                                 n = long(prefix)
2376                                 if n <= 0:
2377                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2378                                         return
2379                                 elif n > self._max_yahoo_results:
2380                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2381                                         n = self._max_yahoo_results
2382                                 self._download_n_results(query, n)
2383                                 return
2384                         except ValueError: # parsing prefix as integer fails
2385                                 self._download_n_results(query, 1)
2386                                 return
2387
2388         def _download_n_results(self, query, n):
2389                 """Downloads a specified number of results for a query"""
2390
2391                 video_ids = []
2392                 already_seen = set()
2393                 pagenum = 1
2394
2395                 while True:
2396                         self.report_download_page(query, pagenum)
2397                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2398                         request = urllib2.Request(result_url)
2399                         try:
2400                                 page = urllib2.urlopen(request).read()
2401                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2402                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2403                                 return
2404
2405                         # Extract video identifiers
2406                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2407                                 video_id = mobj.group(1)
2408                                 if video_id not in already_seen:
2409                                         video_ids.append(video_id)
2410                                         already_seen.add(video_id)
2411                                         if len(video_ids) == n:
2412                                                 # Specified n videos reached
2413                                                 for id in video_ids:
2414                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2415                                                 return
2416
2417                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2418                                 for id in video_ids:
2419                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2420                                 return
2421
2422                         pagenum = pagenum + 1
2423
2424
2425 class YoutubePlaylistIE(InfoExtractor):
2426         """Information Extractor for YouTube playlists."""
2427
2428         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2429         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2430         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2431         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2432         _youtube_ie = None
2433         IE_NAME = u'youtube:playlist'
2434
2435         def __init__(self, youtube_ie, downloader=None):
2436                 InfoExtractor.__init__(self, downloader)
2437                 self._youtube_ie = youtube_ie
2438
2439         def report_download_page(self, playlist_id, pagenum):
2440                 """Report attempt to download playlist page with given number."""
2441                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2442
2443         def _real_initialize(self):
2444                 self._youtube_ie.initialize()
2445
2446         def _real_extract(self, url):
2447                 # Extract playlist id
2448                 mobj = re.match(self._VALID_URL, url)
2449                 if mobj is None:
2450                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2451                         return
2452
2453                 # Single video case
2454                 if mobj.group(3) is not None:
2455                         self._youtube_ie.extract(mobj.group(3))
2456                         return
2457
2458                 # Download playlist pages
2459                 # prefix is 'p' as default for playlists but there are other types that need extra care
2460                 playlist_prefix = mobj.group(1)
2461                 if playlist_prefix == 'a':
2462                         playlist_access = 'artist'
2463                 else:
2464                         playlist_prefix = 'p'
2465                         playlist_access = 'view_play_list'
2466                 playlist_id = mobj.group(2)
2467                 video_ids = []
2468                 pagenum = 1
2469
2470                 while True:
2471                         self.report_download_page(playlist_id, pagenum)
2472                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2473                         try:
2474                                 page = urllib2.urlopen(request).read()
2475                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2477                                 return
2478
2479                         # Extract video identifiers
2480                         ids_in_page = []
2481                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2482                                 if mobj.group(1) not in ids_in_page:
2483                                         ids_in_page.append(mobj.group(1))
2484                         video_ids.extend(ids_in_page)
2485
2486                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2487                                 break
2488                         pagenum = pagenum + 1
2489
2490                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2491                 playlistend = self._downloader.params.get('playlistend', -1)
2492                 video_ids = video_ids[playliststart:playlistend]
2493
2494                 for id in video_ids:
2495                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2496                 return
2497
2498
2499 class YoutubeUserIE(InfoExtractor):
2500         """Information Extractor for YouTube users."""
2501
2502         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2503         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2504         _GDATA_PAGE_SIZE = 50
2505         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2506         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2507         _youtube_ie = None
2508         IE_NAME = u'youtube:user'
2509
2510         def __init__(self, youtube_ie, downloader=None):
2511                 InfoExtractor.__init__(self, downloader)
2512                 self._youtube_ie = youtube_ie
2513
2514         def report_download_page(self, username, start_index):
2515                 """Report attempt to download user page."""
2516                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2517                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2518
2519         def _real_initialize(self):
2520                 self._youtube_ie.initialize()
2521
2522         def _real_extract(self, url):
2523                 # Extract username
2524                 mobj = re.match(self._VALID_URL, url)
2525                 if mobj is None:
2526                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2527                         return
2528
2529                 username = mobj.group(1)
2530
2531                 # Download video ids using YouTube Data API. Result size per
2532                 # query is limited (currently to 50 videos) so we need to query
2533                 # page by page until there are no video ids - it means we got
2534                 # all of them.
2535
2536                 video_ids = []
2537                 pagenum = 0
2538
2539                 while True:
2540                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2541                         self.report_download_page(username, start_index)
2542
2543                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2544
2545                         try:
2546                                 page = urllib2.urlopen(request).read()
2547                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2548                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2549                                 return
2550
2551                         # Extract video identifiers
2552                         ids_in_page = []
2553
2554                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2555                                 if mobj.group(1) not in ids_in_page:
2556                                         ids_in_page.append(mobj.group(1))
2557
2558                         video_ids.extend(ids_in_page)
2559
2560                         # A little optimization - if current page is not
2561                         # "full", ie. does not contain PAGE_SIZE video ids then
2562                         # we can assume that this page is the last one - there
2563                         # are no more ids on further pages - no need to query
2564                         # again.
2565
2566                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2567                                 break
2568
2569                         pagenum += 1
2570
2571                 all_ids_count = len(video_ids)
2572                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2573                 playlistend = self._downloader.params.get('playlistend', -1)
2574
2575                 if playlistend == -1:
2576                         video_ids = video_ids[playliststart:]
2577                 else:
2578                         video_ids = video_ids[playliststart:playlistend]
2579
2580                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2581                                 (username, all_ids_count, len(video_ids)))
2582
2583                 for video_id in video_ids:
2584                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2585
2586
2587 class DepositFilesIE(InfoExtractor):
2588         """Information extractor for depositfiles.com"""
2589
2590         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2591         IE_NAME = u'DepositFiles'
2592
2593         def __init__(self, downloader=None):
2594                 InfoExtractor.__init__(self, downloader)
2595
2596         def report_download_webpage(self, file_id):
2597                 """Report webpage download."""
2598                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2599
2600         def report_extraction(self, file_id):
2601                 """Report information extraction."""
2602                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2603
2604         def _real_initialize(self):
2605                 return
2606
2607         def _real_extract(self, url):
2608                 # At this point we have a new file
2609                 self._downloader.increment_downloads()
2610
2611                 file_id = url.split('/')[-1]
2612                 # Rebuild url in english locale
2613                 url = 'http://depositfiles.com/en/files/' + file_id
2614
2615                 # Retrieve file webpage with 'Free download' button pressed
2616                 free_download_indication = { 'gateway_result' : '1' }
2617                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2618                 try:
2619                         self.report_download_webpage(file_id)
2620                         webpage = urllib2.urlopen(request).read()
2621                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2622                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2623                         return
2624
2625                 # Search for the real file URL
2626                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2627                 if (mobj is None) or (mobj.group(1) is None):
2628                         # Try to figure out reason of the error.
2629                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2630                         if (mobj is not None) and (mobj.group(1) is not None):
2631                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2632                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2633                         else:
2634                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2635                         return
2636
2637                 file_url = mobj.group(1)
2638                 file_extension = os.path.splitext(file_url)[1][1:]
2639
2640                 # Search for file title
2641                 mobj = re.search(r'<b title="(.*?)">', webpage)
2642                 if mobj is None:
2643                         self._downloader.trouble(u'ERROR: unable to extract title')
2644                         return
2645                 file_title = mobj.group(1).decode('utf-8')
2646
2647                 try:
2648                         # Process file information
2649                         self._downloader.process_info({
2650                                 'id':           file_id.decode('utf-8'),
2651                                 'url':          file_url.decode('utf-8'),
2652                                 'uploader':     u'NA',
2653                                 'upload_date':  u'NA',
2654                                 'title':        file_title,
2655                                 'stitle':       file_title,
2656                                 'ext':          file_extension.decode('utf-8'),
2657                                 'format':       u'NA',
2658                                 'player_url':   None,
2659                         })
2660                 except UnavailableVideoError, err:
2661                         self._downloader.trouble(u'ERROR: unable to download file')
2662
2663
2664 class FacebookIE(InfoExtractor):
2665         """Information Extractor for Facebook"""
2666
2667         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2668         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2669         _NETRC_MACHINE = 'facebook'
2670         _available_formats = ['highqual', 'lowqual']
2671         _video_extensions = {
2672                 'highqual': 'mp4',
2673                 'lowqual': 'mp4',
2674         }
2675         IE_NAME = u'facebook'
2676
2677         def __init__(self, downloader=None):
2678                 InfoExtractor.__init__(self, downloader)
2679
2680         def _reporter(self, message):
2681                 """Add header and report message."""
2682                 self._downloader.to_screen(u'[facebook] %s' % message)
2683
2684         def report_login(self):
2685                 """Report attempt to log in."""
2686                 self._reporter(u'Logging in')
2687
2688         def report_video_webpage_download(self, video_id):
2689                 """Report attempt to download video webpage."""
2690                 self._reporter(u'%s: Downloading video webpage' % video_id)
2691
2692         def report_information_extraction(self, video_id):
2693                 """Report attempt to extract video information."""
2694                 self._reporter(u'%s: Extracting video information' % video_id)
2695
2696         def _parse_page(self, video_webpage):
2697                 """Extract video information from page"""
2698                 # General data
2699                 data = {'title': r'class="video_title datawrap">(.*?)</',
2700                         'description': r'<div class="datawrap">(.*?)</div>',
2701                         'owner': r'\("video_owner_name", "(.*?)"\)',
2702                         'upload_date': r'data-date="(.*?)"',
2703                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2704                         }
2705                 video_info = {}
2706                 for piece in data.keys():
2707                         mobj = re.search(data[piece], video_webpage)
2708                         if mobj is not None:
2709                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2710
2711                 # Video urls
2712                 video_urls = {}
2713                 for fmt in self._available_formats:
2714                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2715                         if mobj is not None:
2716                                 # URL is in a Javascript segment inside an escaped Unicode format within
2717                                 # the generally utf-8 page
2718                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2719                 video_info['video_urls'] = video_urls
2720
2721                 return video_info
2722
2723         def _real_initialize(self):
2724                 if self._downloader is None:
2725                         return
2726
2727                 useremail = None
2728                 password = None
2729                 downloader_params = self._downloader.params
2730
2731                 # Attempt to use provided username and password or .netrc data
2732                 if downloader_params.get('username', None) is not None:
2733                         useremail = downloader_params['username']
2734                         password = downloader_params['password']
2735                 elif downloader_params.get('usenetrc', False):
2736                         try:
2737                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2738                                 if info is not None:
2739                                         useremail = info[0]
2740                                         password = info[2]
2741                                 else:
2742                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2743                         except (IOError, netrc.NetrcParseError), err:
2744                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2745                                 return
2746
2747                 if useremail is None:
2748                         return
2749
2750                 # Log in
2751                 login_form = {
2752                         'email': useremail,
2753                         'pass': password,
2754                         'login': 'Log+In'
2755                         }
2756                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2757                 try:
2758                         self.report_login()
2759                         login_results = urllib2.urlopen(request).read()
2760                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2761                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2762                                 return
2763                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2764                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2765                         return
2766
2767         def _real_extract(self, url):
2768                 mobj = re.match(self._VALID_URL, url)
2769                 if mobj is None:
2770                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2771                         return
2772                 video_id = mobj.group('ID')
2773
2774                 # Get video webpage
2775                 self.report_video_webpage_download(video_id)
2776                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2777                 try:
2778                         page = urllib2.urlopen(request)
2779                         video_webpage = page.read()
2780                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2781                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2782                         return
2783
2784                 # Start extracting information
2785                 self.report_information_extraction(video_id)
2786
2787                 # Extract information
2788                 video_info = self._parse_page(video_webpage)
2789
2790                 # uploader
2791                 if 'owner' not in video_info:
2792                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2793                         return
2794                 video_uploader = video_info['owner']
2795
2796                 # title
2797                 if 'title' not in video_info:
2798                         self._downloader.trouble(u'ERROR: unable to extract video title')
2799                         return
2800                 video_title = video_info['title']
2801                 video_title = video_title.decode('utf-8')
2802                 video_title = sanitize_title(video_title)
2803
2804                 # simplified title
2805                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2806                 simple_title = simple_title.strip(ur'_')
2807
2808                 # thumbnail image
2809                 if 'thumbnail' not in video_info:
2810                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2811                         video_thumbnail = ''
2812                 else:
2813                         video_thumbnail = video_info['thumbnail']
2814
2815                 # upload date
2816                 upload_date = u'NA'
2817                 if 'upload_date' in video_info:
2818                         upload_time = video_info['upload_date']
2819                         timetuple = email.utils.parsedate_tz(upload_time)
2820                         if timetuple is not None:
2821                                 try:
2822                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2823                                 except:
2824                                         pass
2825
2826                 # description
2827                 video_description = video_info.get('description', 'No description available.')
2828
2829                 url_map = video_info['video_urls']
2830                 if len(url_map.keys()) > 0:
2831                         # Decide which formats to download
2832                         req_format = self._downloader.params.get('format', None)
2833                         format_limit = self._downloader.params.get('format_limit', None)
2834
2835                         if format_limit is not None and format_limit in self._available_formats:
2836                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2837                         else:
2838                                 format_list = self._available_formats
2839                         existing_formats = [x for x in format_list if x in url_map]
2840                         if len(existing_formats) == 0:
2841                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2842                                 return
2843                         if req_format is None:
2844                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2845                         elif req_format == 'worst':
2846                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2847                         elif req_format == '-1':
2848                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2849                         else:
2850                                 # Specific format
2851                                 if req_format not in url_map:
2852                                         self._downloader.trouble(u'ERROR: requested format not available')
2853                                         return
2854                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2855
2856                 for format_param, video_real_url in video_url_list:
2857
2858                         # At this point we have a new video
2859                         self._downloader.increment_downloads()
2860
2861                         # Extension
2862                         video_extension = self._video_extensions.get(format_param, 'mp4')
2863
2864                         try:
2865                                 # Process video information
2866                                 self._downloader.process_info({
2867                                         'id':           video_id.decode('utf-8'),
2868                                         'url':          video_real_url.decode('utf-8'),
2869                                         'uploader':     video_uploader.decode('utf-8'),
2870                                         'upload_date':  upload_date,
2871                                         'title':        video_title,
2872                                         'stitle':       simple_title,
2873                                         'ext':          video_extension.decode('utf-8'),
2874                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2875                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2876                                         'description':  video_description.decode('utf-8'),
2877                                         'player_url':   None,
2878                                 })
2879                         except UnavailableVideoError, err:
2880                                 self._downloader.trouble(u'\nERROR: unable to download video')
2881
2882 class BlipTVIE(InfoExtractor):
2883         """Information extractor for blip.tv"""
2884
2885         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2886         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2887         IE_NAME = u'blip.tv'
2888
2889         def report_extraction(self, file_id):
2890                 """Report information extraction."""
2891                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2892
2893         def _simplify_title(self, title):
2894                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2895                 res = res.strip(ur'_')
2896                 return res
2897
2898         def _real_extract(self, url):
2899                 mobj = re.match(self._VALID_URL, url)
2900                 if mobj is None:
2901                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2902                         return
2903
2904                 if '?' in url:
2905                         cchar = '&'
2906                 else:
2907                         cchar = '?'
2908                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2909                 request = urllib2.Request(json_url)
2910                 self.report_extraction(mobj.group(1))
2911                 try:
2912                         json_code = urllib2.urlopen(request).read()
2913                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2914                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2915                         return
2916                 try:
2917                         json_data = json.loads(json_code)
2918                         if 'Post' in json_data:
2919                                 data = json_data['Post']
2920                         else:
2921                                 data = json_data
2922
2923                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2924                         video_url = data['media']['url']
2925                         umobj = re.match(self._URL_EXT, video_url)
2926                         if umobj is None:
2927                                 raise ValueError('Can not determine filename extension')
2928                         ext = umobj.group(1)
2929
2930                         self._downloader.increment_downloads()
2931
2932                         info = {
2933                                 'id': data['item_id'],
2934                                 'url': video_url,
2935                                 'uploader': data['display_name'],
2936                                 'upload_date': upload_date,
2937                                 'title': data['title'],
2938                                 'stitle': self._simplify_title(data['title']),
2939                                 'ext': ext,
2940                                 'format': data['media']['mimeType'],
2941                                 'thumbnail': data['thumbnailUrl'],
2942                                 'description': data['description'],
2943                                 'player_url': data['embedUrl']
2944                         }
2945                 except (ValueError,KeyError), err:
2946                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2947                         return
2948
2949                 try:
2950                         self._downloader.process_info(info)
2951                 except UnavailableVideoError, err:
2952                         self._downloader.trouble(u'\nERROR: unable to download video')
2953
2954
class MyVideoIE(InfoExtractor):
	"""Information Extractor for myvideo.de."""

	# Group 1 captures the numeric watch id, group 2 the title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
	IE_NAME = u'myvideo'
2960
	def __init__(self, downloader=None):
		# No extractor-specific state; just the base-class setup.
		InfoExtractor.__init__(self, downloader)
2963         
2964         def report_download_webpage(self, video_id):
2965                 """Report webpage download."""
2966                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2967
2968         def report_extraction(self, video_id):
2969                 """Report information extraction."""
2970                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2971
	def _real_initialize(self):
		# Nothing to initialize for myvideo.de.
		return
2974
2975         def _real_extract(self,url):
2976                 mobj = re.match(self._VALID_URL, url)
2977                 if mobj is None:
2978                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2979                         return
2980
2981                 video_id = mobj.group(1)
2982                 simple_title = mobj.group(2).decode('utf-8')
2983                 # should actually not be necessary
2984                 simple_title = sanitize_title(simple_title)
2985                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2986
2987                 # Get video webpage
2988                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2989                 try:
2990                         self.report_download_webpage(video_id)
2991                         webpage = urllib2.urlopen(request).read()
2992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2993                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2994                         return
2995
2996                 self.report_extraction(video_id)
2997                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2998                                  webpage)
2999                 if mobj is None:
3000                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3001                         return
3002                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3003
3004                 mobj = re.search('<title>([^<]+)</title>', webpage)
3005                 if mobj is None:
3006                         self._downloader.trouble(u'ERROR: unable to extract title')
3007                         return
3008
3009                 video_title = mobj.group(1)
3010                 video_title = sanitize_title(video_title)
3011
3012                 try:
3013                         print(video_url)
3014                         self._downloader.process_info({
3015                                 'id':           video_id,
3016                                 'url':          video_url,
3017                                 'uploader':     u'NA',
3018                                 'upload_date':  u'NA',
3019                                 'title':        video_title,
3020                                 'stitle':       simple_title,
3021                                 'ext':          u'flv',
3022                                 'format':       u'NA',
3023                                 'player_url':   None,
3024                         })
3025                 except UnavailableVideoError:
3026                         self._downloader.trouble(u'\nERROR: Unable to download video')
3027
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a ":tds"/":colbert"-style shortname, or a full-episodes
	# URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that the per-item media configuration is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's show index (MRSS feed) is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse runs of characters outside simple_title_chars into single
		# underscores, then trim leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Extract all media items of an episode and download each of them."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortname (":tds" etc.) means "newest full episode of that show":
		# rewrite the URL to the show's full-episodes page and re-match.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# With no explicit episode in the URL we expect the site to redirect
		# to the newest one; the redirect target is inspected below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Recover the concrete episode URL from the redirect target.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash <param> embed carries both the player URL (group 0) and
		# the mtvnservices media URI (group 1).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the raw player URL through any HTTP redirects.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# The MRSS index lists every media item of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		# Each <item> is one part of the episode; download every one of them.
		for itemEl in itemEls:
			# guid looks like "...:<show>.com:<id>" — NOTE(review): inferred
			# from the split()s below; confirm against a live feed.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-item configuration enumerating the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			# Collect (bitrate, source URL) pairs for each rendition.
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3167
3168
3169 class EscapistIE(InfoExtractor):
3170         """Information extractor for The Escapist """
3171
3172         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3173         IE_NAME = u'escapist'
3174
3175         def report_extraction(self, showName):
3176                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3177
3178         def report_config_download(self, showName):
3179                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3180
3181         def _simplify_title(self, title):
3182                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3183                 res = res.strip(ur'_')
3184                 return res
3185
3186         def _real_extract(self, url):
3187                 htmlParser = HTMLParser.HTMLParser()
3188
3189                 mobj = re.match(self._VALID_URL, url)
3190                 if mobj is None:
3191                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3192                         return
3193                 showName = mobj.group('showname')
3194                 videoId = mobj.group('episode')
3195
3196                 self.report_extraction(showName)
3197                 try:
3198                         webPage = urllib2.urlopen(url).read()
3199                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3200                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3201                         return
3202
3203                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3204                 description = htmlParser.unescape(descMatch.group(1))
3205                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3206                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3207                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3208                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3209                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3210                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3211
3212                 self.report_config_download(showName)
3213                 try:
3214                         configJSON = urllib2.urlopen(configUrl).read()
3215                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3216                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3217                         return
3218
3219                 # Technically, it's JavaScript, not JSON
3220                 configJSON = configJSON.replace("'", '"')
3221
3222                 try:
3223                         config = json.loads(configJSON)
3224                 except (ValueError,), err:
3225                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3226                         return
3227
3228                 playlist = config['playlist']
3229                 videoUrl = playlist[1]['url']
3230
3231                 self._downloader.increment_downloads()
3232                 info = {
3233                         'id': videoId,
3234                         'url': videoUrl,
3235                         'uploader': showName,
3236                         'upload_date': None,
3237                         'title': showName,
3238                         'stitle': self._simplify_title(showName),
3239                         'ext': 'flv',
3240                         'format': 'flv',
3241                         'thumbnail': imgUrl,
3242                         'description': description,
3243                         'player_url': playerUrl,
3244                 }
3245
3246                 try:
3247                         self._downloader.process_info(info)
3248                 except UnavailableVideoError, err:
3249                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3250
3251
3252
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered with a downloader through its
	add_post_processor() method. After every successful download the
	downloader invokes run() on each registered PostProcessor in turn,
	feeding each call the dictionary returned by the previous one (the
	first call receives the downloader's initial argument).

	Returning None from run() stops the chain; so does reaching its end.

	Like InfoExtractor, this class takes part in a "mutual registration"
	scheme with its downloader.
	"""

	# Downloader this post processor is attached to (mutual registration).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary with one extra
		key, "filepath", naming the downloaded file on disk.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly with some fields changed) to pass to the next
		PostProcessor in the chain. May raise a PostProcessingError, which
		the calling downloader takes into account.
		"""
		return information # by default, do nothing
3298
3299
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	copy the audio stream losslessly (aac/mp3 sources with a matching
	preference) or transcode it to the preferred codec and quality.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec      # 'best', 'aac' or 'mp3'
		self._preferredquality = preferredquality  # ffmpeg -ab bitrate spec, or None
		self._keepvideo = keepvideo                # keep the source video file?

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the first audio stream of path (via
		ffprobe), or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# Bug fix: use open() instead of the Python-2-only file()
			# builtin, and close the devnull handle (it used to leak).
			dev_null = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=dev_null, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				dev_null.close()
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name precedes codec_type inside a stream section, so
				# this is the codec of the first audio stream encountered.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract/convert audio from path into out_path.
		Return True on success, False otherwise."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# Bug fix: as above, open() instead of file(), and close devnull.
			dev_null = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=dev_null, stderr=subprocess.STDOUT)
			finally:
				dev_null.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'] into a sibling file,
		optionally remove the video, and return the updated information
		dict (or None on failure, which stops the PP chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Bug fix: was a bare "except:", which also swallowed
				# KeyboardInterrupt and SystemExit.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3395
3396
3397 def updateSelf(downloader, filename):
3398         ''' Update the program file with the latest version from the repository '''
3399         # Note: downloader only used for options
3400         if not os.access(filename, os.W_OK):
3401                 sys.exit('ERROR: no write permissions on %s' % filename)
3402
3403         downloader.to_screen('Updating to latest version...')
3404
3405         try:
3406                 try:
3407                         urlh = urllib.urlopen(UPDATE_URL)
3408                         newcontent = urlh.read()
3409                 finally:
3410                         urlh.close()
3411         except (IOError, OSError), err:
3412                 sys.exit('ERROR: unable to download latest version')
3413
3414         try:
3415                 outf = open(filename, 'wb')
3416                 try:
3417                         outf.write(newcontent)
3418                 finally:
3419                         outf.close()
3420         except (IOError, OSError), err:
3421                 sys.exit('ERROR: unable to overwrite current version')
3422
3423         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3424
3425 def parseOpts():
3426         # Deferred imports
3427         import getpass
3428         import optparse
3429
3430         def _format_option_string(option):
3431                 ''' ('-o', '--option') -> -o, --format METAVAR'''
3432
3433                 opts = []
3434
3435                 if option._short_opts: opts.append(option._short_opts[0])
3436                 if option._long_opts: opts.append(option._long_opts[0])
3437                 if len(opts) > 1: opts.insert(1, ', ')
3438
3439                 if option.takes_value(): opts.append(' %s' % option.metavar)
3440
3441                 return "".join(opts)
3442
3443         def _find_term_columns():
3444                 columns = os.environ.get('COLUMNS', None)
3445                 if columns:
3446                         return int(columns)
3447
3448                 try:
3449                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3450                         out,err = sp.communicate()
3451                         return int(out.split()[1])
3452                 except:
3453                         pass
3454                 return None
3455
3456         max_width = 80
3457         max_help_position = 80
3458
3459         # No need to wrap help messages if we're on a wide console
3460         columns = _find_term_columns()
3461         if columns: max_width = columns
3462
3463         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3464         fmt.format_option_strings = _format_option_string
3465
3466         kw = {
3467                 'version'   : __version__,
3468                 'formatter' : fmt,
3469                 'usage' : '%prog [options] url [url...]',
3470                 'conflict_handler' : 'resolve',
3471         }
3472
3473         parser = optparse.OptionParser(**kw)
3474
3475         # option groups
3476         general        = optparse.OptionGroup(parser, 'General Options')
3477         selection      = optparse.OptionGroup(parser, 'Video Selection')
3478         authentication = optparse.OptionGroup(parser, 'Authentication Options')
3479         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
3480         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
3481         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
3482         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3483
3484         general.add_option('-h', '--help',
3485                         action='help', help='print this help text and exit')
3486         general.add_option('-v', '--version',
3487                         action='version', help='print program version and exit')
3488         general.add_option('-U', '--update',
3489                         action='store_true', dest='update_self', help='update this program to latest version')
3490         general.add_option('-i', '--ignore-errors',
3491                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3492         general.add_option('-r', '--rate-limit',
3493                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3494         general.add_option('-R', '--retries',
3495                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3496         general.add_option('--dump-user-agent',
3497                         action='store_true', dest='dump_user_agent',
3498                         help='display the current browser identification', default=False)
3499         general.add_option('--list-extractors',
3500                         action='store_true', dest='list_extractors',
3501                         help='List all supported extractors and the URLs they would handle', default=False)
3502
3503         selection.add_option('--playlist-start',
3504                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3505         selection.add_option('--playlist-end',
3506                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3507         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3508         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3509
3510         authentication.add_option('-u', '--username',
3511                         dest='username', metavar='USERNAME', help='account username')
3512         authentication.add_option('-p', '--password',
3513                         dest='password', metavar='PASSWORD', help='account password')
3514         authentication.add_option('-n', '--netrc',
3515                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3516
3517
3518         video_format.add_option('-f', '--format',
3519                         action='store', dest='format', metavar='FORMAT', help='video format code')
3520         video_format.add_option('--all-formats',
3521                         action='store_const', dest='format', help='download all available video formats', const='all')
3522         video_format.add_option('--max-quality',
3523                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3524
3525
3526         verbosity.add_option('-q', '--quiet',
3527                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
3528         verbosity.add_option('-s', '--simulate',
3529                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3530         verbosity.add_option('--skip-download',
3531                         action='store_true', dest='skip_download', help='do not download the video', default=False)
3532         verbosity.add_option('-g', '--get-url',
3533                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3534         verbosity.add_option('-e', '--get-title',
3535                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3536         verbosity.add_option('--get-thumbnail',
3537                         action='store_true', dest='getthumbnail',
3538                         help='simulate, quiet but print thumbnail URL', default=False)
3539         verbosity.add_option('--get-description',
3540                         action='store_true', dest='getdescription',
3541                         help='simulate, quiet but print video description', default=False)
3542         verbosity.add_option('--get-filename',
3543                         action='store_true', dest='getfilename',
3544                         help='simulate, quiet but print output filename', default=False)
3545         verbosity.add_option('--get-format',
3546                         action='store_true', dest='getformat',
3547                         help='simulate, quiet but print output format', default=False)
3548         verbosity.add_option('--no-progress',
3549                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3550         verbosity.add_option('--console-title',
3551                         action='store_true', dest='consoletitle',
3552                         help='display progress in console titlebar', default=False)
3553
3554
3555         filesystem.add_option('-t', '--title',
3556                         action='store_true', dest='usetitle', help='use title in file name', default=False)
3557         filesystem.add_option('-l', '--literal',
3558                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3559         filesystem.add_option('-A', '--auto-number',
3560                         action='store_true', dest='autonumber',
3561                         help='number downloaded files starting from 00000', default=False)
3562         filesystem.add_option('-o', '--output',
3563                         dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3564         filesystem.add_option('-a', '--batch-file',
3565                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3566         filesystem.add_option('-w', '--no-overwrites',
3567                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3568         filesystem.add_option('-c', '--continue',
3569                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3570         filesystem.add_option('--cookies',
3571                         dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3572         filesystem.add_option('--no-part',
3573                         action='store_true', dest='nopart', help='do not use .part files', default=False)
3574         filesystem.add_option('--no-mtime',
3575                         action='store_false', dest='updatetime',
3576                         help='do not use the Last-modified header to set the file modification time', default=True)
3577         filesystem.add_option('--write-description',
3578                         action='store_true', dest='writedescription',
3579                         help='write video description to a .description file', default=False)
3580         filesystem.add_option('--write-info-json',
3581                         action='store_true', dest='writeinfojson',
3582                         help='write video metadata to a .info.json file', default=False)
3583
3584
3585         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3586                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3587         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3588                         help='"best", "aac" or "mp3"; best by default')
3589         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3590                         help='ffmpeg audio bitrate specification, 128k by default')
3591         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3592                         help='keeps the video file on disk after the post-processing; the video is erased by default')
3593
3594
3595         parser.add_option_group(general)
3596         parser.add_option_group(selection)
3597         parser.add_option_group(filesystem)
3598         parser.add_option_group(verbosity)
3599         parser.add_option_group(video_format)
3600         parser.add_option_group(authentication)
3601         parser.add_option_group(postproc)
3602
3603         opts, args = parser.parse_args()
3604
3605         return parser, opts, args
3606
def gen_extractors():
	"""Instantiate one information extractor of every supported kind.

	Ordering is significant: URLs are offered to the extractors in list
	order and the first one whose suitable() accepts the URL handles it,
	so the catch-all GenericIE must always come last.
	"""
	# A few extractors are reused as delegates by their playlist/search
	# counterparts, so they are created once and shared.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
	]
	# Fallback that accepts any URL — must stay at the very end.
	extractors.append(GenericIE())
	return extractors
3636
3637 def main():
3638         parser, opts, args = parseOpts()
3639
3640         # Open appropriate CookieJar
3641         if opts.cookiefile is None:
3642                 jar = cookielib.CookieJar()
3643         else:
3644                 try:
3645                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3646                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3647                                 jar.load()
3648                 except (IOError, OSError), err:
3649                         sys.exit(u'ERROR: unable to open cookie file')
3650
3651         # Dump user agent
3652         if opts.dump_user_agent:
3653                 print std_headers['User-Agent']
3654                 sys.exit(0)
3655
3656         # Batch file verification
3657         batchurls = []
3658         if opts.batchfile is not None:
3659                 try:
3660                         if opts.batchfile == '-':
3661                                 batchfd = sys.stdin
3662                         else:
3663                                 batchfd = open(opts.batchfile, 'r')
3664                         batchurls = batchfd.readlines()
3665                         batchurls = [x.strip() for x in batchurls]
3666                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3667                 except IOError:
3668                         sys.exit(u'ERROR: batch file could not be read')
3669         all_urls = batchurls + args
3670
3671         # General configuration
3672         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3673         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3674         urllib2.install_opener(opener)
3675         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3676
3677         extractors = gen_extractors()
3678
3679         if opts.list_extractors:
3680                 for ie in extractors:
3681                         print(ie.IE_NAME)
3682                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3683                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3684                         for mu in matchedUrls:
3685                                 print(u'  ' + mu)
3686                 sys.exit(0)
3687
3688         # Conflicting, missing and erroneous options
3689         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3690                 parser.error(u'using .netrc conflicts with giving username/password')
3691         if opts.password is not None and opts.username is None:
3692                 parser.error(u'account username missing')
3693         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3694                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3695         if opts.usetitle and opts.useliteral:
3696                 parser.error(u'using title conflicts with using literal title')
3697         if opts.username is not None and opts.password is None:
3698                 opts.password = getpass.getpass(u'Type account password and press return:')
3699         if opts.ratelimit is not None:
3700                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3701                 if numeric_limit is None:
3702                         parser.error(u'invalid rate limit specified')
3703                 opts.ratelimit = numeric_limit
3704         if opts.retries is not None:
3705                 try:
3706                         opts.retries = long(opts.retries)
3707                 except (TypeError, ValueError), err:
3708                         parser.error(u'invalid retry count specified')
3709         try:
3710                 opts.playliststart = int(opts.playliststart)
3711                 if opts.playliststart <= 0:
3712                         raise ValueError(u'Playlist start must be positive')
3713         except (TypeError, ValueError), err:
3714                 parser.error(u'invalid playlist start number specified')
3715         try:
3716                 opts.playlistend = int(opts.playlistend)
3717                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3718                         raise ValueError(u'Playlist end must be greater than playlist start')
3719         except (TypeError, ValueError), err:
3720                 parser.error(u'invalid playlist end number specified')
3721         if opts.extractaudio:
3722                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3723                         parser.error(u'invalid audio format specified')
3724
3725         # File downloader
3726         fd = FileDownloader({
3727                 'usenetrc': opts.usenetrc,
3728                 'username': opts.username,
3729                 'password': opts.password,
3730                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3731                 'forceurl': opts.geturl,
3732                 'forcetitle': opts.gettitle,
3733                 'forcethumbnail': opts.getthumbnail,
3734                 'forcedescription': opts.getdescription,
3735                 'forcefilename': opts.getfilename,
3736                 'forceformat': opts.getformat,
3737                 'simulate': opts.simulate,
3738                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3739                 'format': opts.format,
3740                 'format_limit': opts.format_limit,
3741                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3742                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3743                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3744                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3745                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3746                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3747                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3748                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3749                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3750                         or u'%(id)s.%(ext)s'),
3751                 'ignoreerrors': opts.ignoreerrors,
3752                 'ratelimit': opts.ratelimit,
3753                 'nooverwrites': opts.nooverwrites,
3754                 'retries': opts.retries,
3755                 'continuedl': opts.continue_dl,
3756                 'noprogress': opts.noprogress,
3757                 'playliststart': opts.playliststart,
3758                 'playlistend': opts.playlistend,
3759                 'logtostderr': opts.outtmpl == '-',
3760                 'consoletitle': opts.consoletitle,
3761                 'nopart': opts.nopart,
3762                 'updatetime': opts.updatetime,
3763                 'writedescription': opts.writedescription,
3764                 'writeinfojson': opts.writeinfojson,
3765                 'matchtitle': opts.matchtitle,
3766                 'rejecttitle': opts.rejecttitle,
3767                 })
3768         for extractor in extractors:
3769                 fd.add_info_extractor(extractor)
3770
3771         # PostProcessors
3772         if opts.extractaudio:
3773                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3774
3775         # Update version
3776         if opts.update_self:
3777                 updateSelf(fd, sys.argv[0])
3778
3779         # Maybe do nothing
3780         if len(all_urls) < 1:
3781                 if not opts.update_self:
3782                         parser.error(u'you must provide at least one URL')
3783                 else:
3784                         sys.exit()
3785         retcode = fd.download(all_urls)
3786
3787         # Dump cookie jar if requested
3788         if opts.cookiefile is not None:
3789                 try:
3790                         jar.save()
3791                 except (IOError, OSError), err:
3792                         sys.exit(u'ERROR: unable to save cookie jar')
3793
3794         sys.exit(retcode)
3795
3796
if __name__ == '__main__':
	# Top-level boundary: translate the known failure modes into clean
	# process exit codes/messages instead of tracebacks.
	try:
		main()
	except KeyboardInterrupt:
		# Ctrl-C: abort with a message on stderr.
		sys.exit(u'\nERROR: Interrupted by user')
	except DownloadError:
		# The downloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
3806
3807 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: