Update LATEST_VERSION (and wait for a script to do it so I do not forget ;) )
[youtube-dl.git] / youtube-dl
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Contributors to this script, roughly in order of first contribution.
__author__  = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',
	'Gergely Imreh',
	'Rogério Brito',
	'Philipp Hagemeister',
	'Sören Schulze',
	)

__license__ = 'Public Domain'
__version__ = '2011.09.15'

# Location of the latest released version of this script; used by the
# self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5
70         pass # Not officially supported, but let it slip
71
# Default HTTP headers sent with every request; they mimic a desktop
# Firefox so sites serve the same pages they would to a regular browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simple" titles: ASCII letters and
# digits, as unicode strings.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal stand-in for the stdlib json module (decoding only)."""
		@staticmethod
		def loads(s):
			"""Decode the UTF-8 encoded JSON document s into Python objects."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors report the position and remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				"""Skip whitespace from index i; return the next non-space index."""
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				"""Decode one backslash escape; used as a re.sub() callback."""
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair \uXXXX\uXXXX: combine into one code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				"""Parse a string literal whose opening quote is at index i."""
				i += 1
				e = i
				while True:
					# Find the closing quote; a quote preceded by an odd
					# number of backslashes is escaped and must be skipped.
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				"""Parse an object whose opening brace is at index i."""
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				"""Parse an array whose opening bracket is at index i."""
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				"""Parse one of the literals true/false/null starting at index i."""
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				"""Parse an int or float starting at index i."""
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on a value's first character; anything not listed is
			# assumed to be a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				"""Parse one JSON value starting at index i; return (next_i, value)."""
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and can encode text.
		u'TEST'.encode(pref)
	except Exception:
		# Misconfigured locales can report unknown or unusable codecs;
		# fall back to a safe default.  (Was a bare `except:`, which also
		# swallowed KeyboardInterrupt/SystemExit.)
		pref = 'UTF-8'
	return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# Special case: write to standard output.  On Windows, stdout
			# must be switched to binary mode or the data gets mangled.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
270
271
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# Returns None when the string cannot be parsed as an RFC 2822 date.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised via FileDownloader.trouble() when 'ignoreerrors' is off.
	pass
289
290
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	# Typically the result of a fixed output template with several videos.
	pass
298
299
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
307
308
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
316
317
class ContentTooShortError(Exception):
	"""Exception signalling a download that ended prematurely.

	Raised by FileDownloader objects when the file they received is
	smaller than the size the server announced first, which usually
	means the connection was interrupted.
	"""
	# Class-level defaults; every instance overwrites both in __init__.
	# Both values are byte counts.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first (no zlib header), then fall back to a
		# regular zlib stream; servers disagree on what "deflate" means.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer Pythons let addinfourl carry the status code (and expose
		# getcode()); on older versions set the attribute manually.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Re-add each standard header so it replaces any caller-set value.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honour the magic marker header and strip it before the request
		# actually goes on the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
	params = None                # Dictionary of options passed to __init__
	_ies = []                    # Registered InfoExtractors
	_pps = []                    # Registered PostProcessors
	_download_retcode = None     # Process return code (set to 1 on errors)
	_num_downloads = None        # Ordinal of the current download (%(autonumber)s)
	_screen_file = None          # Stream for screen messages (stdout or stderr)
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the extractor keeps a reference back to us.
		ie.set_downloader(self)
535
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the post-processor keeps a reference back to us.
		pp.set_downloader(self)
540
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# terminator chosen above controls line endings instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
551
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly with the system's preferred encoding.
		print >>sys.stderr, message.encode(preferredencoding())
555
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# Escape sequence understood by xterm-compatible terminals to
			# set the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# A template is "fixed" when it contains no %(field)s substitution,
		# i.e. every download would be written to the same file name.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached with 'ignoreerrors': remember that something failed
		# so the process can still exit non-zero.
		self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		# No .part file when disabled, when streaming to stdout, or when
		# the target exists but is not a regular file (presumably a
		# device/FIFO, where rename-into-place would not apply).
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			return filename
		return filename + u'.part'
603
	def undo_temp_name(self, filename):
		# Strip the '.part' suffix added by temp_name(), if present.
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]
		return filename
608
	def try_rename(self, old_filename, new_filename):
		"""Rename the temporary file to its final name, reporting failures."""
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633
	def report_writedescription(self, descfn):
		"""Report that the video description is being written to descfn."""
		# Encoding problems in the filename must not abort the download.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
637
	def report_writeinfojson(self, infofn):
		"""Report that the JSON metadata file is being written to infofn."""
		# Encoding problems in the filename must not abort the download.
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
641
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding problems in the filename must not abort the download.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
645
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# '\r' plus skip_eol keeps rewriting the same screen line.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
654
	def report_resuming_byte(self, resume_len):
		"""Report an attempt to resume the download at the given byte offset."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
658
	def report_retry(self, count, retries):
		"""Report retry number `count` of `retries` after an HTTP 5xx error."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
662
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The file name may not be representable in the console
			# encoding; fall back to a message without it.
			self.to_screen(u'[download] The file has already been downloaded')
669
	def report_unable_to_resume(self):
		"""Report it was impossible to resume the download."""
		self.to_screen(u'[download] Unable to resume')
673
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# The progress line is still on screen; just terminate it.
			self.to_screen(u'')
680
681         def increment_downloads(self):
682                 """Increment the ordinal that assigns a number to each file."""
683                 self._num_downloads += 1
684
685         def prepare_filename(self, info_dict):
686                 """Generate the output filename."""
687                 try:
688                         template_dict = dict(info_dict)
689                         template_dict['epoch'] = unicode(long(time.time()))
690                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691                         filename = self.params['outtmpl'] % template_dict
692                         return filename
693                 except (ValueError, KeyError), err:
694                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
695                         return None
696
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Runs the whole per-video pipeline: forced printing in simulate
                mode, title match/reject filtering, overwrite protection,
                directory creation, optional .description / .info.json sidecar
                files, the actual download and finally postprocessing.
                """
                filename = self.prepare_filename(info_dict)
                # Do nothing else if in simulate mode: only honor the --force-*
                # printing options, then bail out before touching the disk.
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcefilename', False) and filename is not None:
                                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # prepare_filename() already reported the error in this case.
                if filename is None:
                        return

                # Title-based filtering (--match-title / --reject-title).
                matchtitle=self.params.get('matchtitle',False)
                rejecttitle=self.params.get('rejecttitle',False)
                title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
                        return
                if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
                        return

                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                # Create the destination directory if needed.
                try:
                        dn = os.path.dirname(filename)
                        if dn != '' and not os.path.exists(dn):
                                os.makedirs(dn)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
                        return

                # Optionally write the video description next to the video file.
                if self.params.get('writedescription', False):
                        try:
                                descfn = filename + '.description'
                                self.report_writedescription(descfn)
                                descfile = open(descfn, 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return

                # Optionally dump the raw info dict as JSON.
                if self.params.get('writeinfojson', False):
                        infofn = filename + '.info.json'
                        self.report_writeinfojson(infofn)
                        # Probe for a usable json module first: on old Pythons the
                        # name may be missing (NameError) or a stand-in without
                        # dump (AttributeError).
                        try:
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(infofn, 'wb')
                                try:
                                        json.dump(info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                                return

                # Perform the actual download; map each failure mode to the
                # appropriate user-visible error.
                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                # Run postprocessors only on a successful download.
                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return
789
790         def download(self, url_list):
791                 """Download a given list of URLs."""
792                 if len(url_list) > 1 and self.fixed_template():
793                         raise SameFileError(self.params['outtmpl'])
794
795                 for url in url_list:
796                         suitable_found = False
797                         for ie in self._ies:
798                                 # Go to next InfoExtractor if not suitable
799                                 if not ie.suitable(url):
800                                         continue
801
802                                 # Suitable InfoExtractor found
803                                 suitable_found = True
804
805                                 # Extract information from URL and process it
806                                 ie.extract(url)
807
808                                 # Suitable InfoExtractor had been found; go to next URL
809                                 break
810
811                         if not suitable_found:
812                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
813
814                 return self._download_retcode
815
816         def post_process(self, filename, ie_info):
817                 """Run the postprocessing chain on the given file."""
818                 info = dict(ie_info)
819                 info['filepath'] = filename
820                 for pp in self._pps:
821                         info = pp.run(info)
822                         if info is None:
823                                 break
824
825         def _download_with_rtmpdump(self, filename, url, player_url):
826                 self.report_destination(filename)
827                 tmpfilename = self.temp_name(filename)
828
829                 # Check for rtmpdump first
830                 try:
831                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
832                 except (OSError, IOError):
833                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
834                         return False
835
836                 # Download using rtmpdump. rtmpdump returns exit code 2 when
837                 # the connection was interrumpted and resuming appears to be
838                 # possible. This is part of rtmpdump's normal usage, AFAIK.
839                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
840                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
841                 while retval == 2 or retval == 1:
842                         prevsize = os.path.getsize(tmpfilename)
843                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
844                         time.sleep(5.0) # This seems to be needed
845                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
846                         cursize = os.path.getsize(tmpfilename)
847                         if prevsize == cursize and retval == 1:
848                                 break
849                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
850                         if prevsize == cursize and retval == 2 and cursize > 1024:
851                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
852                                 retval = 0
853                                 break
854                 if retval == 0:
855                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
856                         self.try_rename(tmpfilename, filename)
857                         return True
858                 else:
859                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
860                         return False
861
        def _do_download(self, filename, url, player_url):
                """Download `url` to `filename` over HTTP (or rtmpdump for rtmp URLs).

                Handles resuming of partial downloads, retries on server errors,
                adaptive block sizing, progress reporting and rate limiting.
                Returns True on success, False on failure; raises
                ContentTooShortError when the server sent fewer bytes than
                announced.
                """
                # Check file already present
                if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
                        self.report_file_already_downloaded(filename)
                        return True

                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                tmpfilename = self.temp_name(filename)
                stream = None          # opened lazily, on the first received block
                open_mode = 'wb'

                # Do not include the Accept-Encoding header
                headers = {'Youtubedl-no-compression': 'True'}
                # basic_request stays without a Range header, as a fallback.
                basic_request = urllib2.Request(url, None, headers)
                request = urllib2.Request(url, None, headers)

                # Establish possible resume length
                if os.path.isfile(tmpfilename):
                        resume_len = os.path.getsize(tmpfilename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range', 'bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        self.try_rename(tmpfilename, filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry (only 5xx errors reach this point; others re-raised above)
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                # Total expected size = announced remaining bytes + what we already have.
                data_len = data.info().get('Content-length', None)
                if data_len is not None:
                        data_len = long(data_len) + resume_len
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0 + resume_len
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        if len(data_block) == 0:
                                break
                        byte_counter += len(data_block)

                        # Open file just in time
                        if stream is None:
                                try:
                                        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                                        assert stream is not None
                                        filename = self.undo_temp_name(tmpfilename)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        # Adapt the block size to the observed throughput.
                        block_size = self.best_block_size(after - before, len(data_block))

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter - resume_len)

                # Server closed the connection before sending any data.
                if stream is None:
                        self.trouble(u'\nERROR: Did not get any data blocks')
                        return False
                stream.close()
                self.report_finish()
                if data_len is not None and byte_counter != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                self.try_rename(tmpfilename, filename)

                # Update file modification time
                if self.params.get('updatetime', True):
                        self.try_utime(filename, data.info().get('last-modified', None))

                return True
997
998
class InfoExtractor(object):
        """Base class for site-specific information extractors.

        Given a URL, an information extractor produces a dictionary with the
        real video URL, titles and related data, which is then handed to the
        FileDownloader for the actual download. Each produced dictionary must
        include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional. Their primary purpose is to allow
        youtube-dl to serve as the backend for a video search function, such
        as the one in youtube2mp3.  They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine _real_initialize(), _real_extract() and
        the suitable() static method, and are typically instantiated and
        registered with the main downloader.
        """

        # True once _real_initialize() has run for this instance.
        _ready = False
        # The FileDownloader this extractor reports to (may be None).
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Return True if this IE can handle the given URL."""
                return False

        def initialize(self):
                """Run one-time initialization (authentication, etc.) if needed."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extract URL information and return it as a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader this IE reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
1070
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Matches watch pages, youtu.be short links, embed/v/e paths and bare
        # video IDs; group 2 captures the video identifier.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Visited first to force the site language to English (see _real_initialize).
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name used to look up credentials in ~/.netrc.
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
        # Maps format codes to filename extensions.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }
1091
1092         @staticmethod
1093         def suitable(url):
1094                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1095
1096         def report_lang(self):
1097                 """Report attempt to set language."""
1098                 self._downloader.to_screen(u'[youtube] Setting language')
1099
1100         def report_login(self):
1101                 """Report attempt to log in."""
1102                 self._downloader.to_screen(u'[youtube] Logging in')
1103
1104         def report_age_confirmation(self):
1105                 """Report attempt to confirm age."""
1106                 self._downloader.to_screen(u'[youtube] Confirming age')
1107
1108         def report_video_webpage_download(self, video_id):
1109                 """Report attempt to download video webpage."""
1110                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1111
1112         def report_video_info_webpage_download(self, video_id):
1113                 """Report attempt to download video info webpage."""
1114                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1115
1116         def report_information_extraction(self, video_id):
1117                 """Report attempt to extract video information."""
1118                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1119
1120         def report_unavailable_format(self, video_id, format):
1121                 """Report extracted video URL."""
1122                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1123
1124         def report_rtmp_download(self):
1125                 """Indicate the download will use the RTMP protocol."""
1126                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1127
        def _real_initialize(self):
                """Set the site language and, if credentials are available,
                log in and confirm age.

                Credentials come from the downloader params ('username' /
                'password') or, with 'usenetrc', from ~/.netrc. All failures
                before age confirmation are reported as warnings and abort
                initialization without raising.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language (so later page scraping sees English text)
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still present, the login failed.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        # Unlike the steps above, failing here is a hard error.
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1196
1197         def _real_extract(self, url):
1198                 # Extract video id from URL
1199                 mobj = re.match(self._VALID_URL, url)
1200                 if mobj is None:
1201                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1202                         return
1203                 video_id = mobj.group(2)
1204
1205                 # Get video webpage
1206                 self.report_video_webpage_download(video_id)
1207                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1208                 try:
1209                         video_webpage = urllib2.urlopen(request).read()
1210                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1212                         return
1213
1214                 # Attempt to extract SWF player URL
1215                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216                 if mobj is not None:
1217                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1218                 else:
1219                         player_url = None
1220
1221                 # Get video info
1222                 self.report_video_info_webpage_download(video_id)
1223                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225                                         % (video_id, el_type))
1226                         request = urllib2.Request(video_info_url)
1227                         try:
1228                                 video_info_webpage = urllib2.urlopen(request).read()
1229                                 video_info = parse_qs(video_info_webpage)
1230                                 if 'token' in video_info:
1231                                         break
1232                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1234                                 return
1235                 if 'token' not in video_info:
1236                         if 'reason' in video_info:
1237                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1238                         else:
1239                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1240                         return
1241
1242                 # Start extracting information
1243                 self.report_information_extraction(video_id)
1244
1245                 # uploader
1246                 if 'author' not in video_info:
1247                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1248                         return
1249                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1250
1251                 # title
1252                 if 'title' not in video_info:
1253                         self._downloader.trouble(u'ERROR: unable to extract video title')
1254                         return
1255                 video_title = urllib.unquote_plus(video_info['title'][0])
1256                 video_title = video_title.decode('utf-8')
1257                 video_title = sanitize_title(video_title)
1258
1259                 # simplified title
1260                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261                 simple_title = simple_title.strip(ur'_')
1262
1263                 # thumbnail image
1264                 if 'thumbnail_url' not in video_info:
1265                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266                         video_thumbnail = ''
1267                 else:   # don't panic if we can't find it
1268                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1269
1270                 # upload date
1271                 upload_date = u'NA'
1272                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273                 if mobj is not None:
1274                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276                         for expression in format_expressions:
1277                                 try:
1278                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1279                                 except:
1280                                         pass
1281
1282                 # description
1283                 try:
1284                         lxml.etree
1285                 except NameError:
1286                         video_description = u'No description available.'
1287                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289                                 if mobj is not None:
1290                                         video_description = mobj.group(1).decode('utf-8')
1291                 else:
1292                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295                         # TODO use another parser
1296
1297                 # token
1298                 video_token = urllib.unquote_plus(video_info['token'][0])
1299
1300                 # Decide which formats to download
1301                 req_format = self._downloader.params.get('format', None)
1302
1303                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304                         self.report_rtmp_download()
1305                         video_url_list = [(None, video_info['conn'][0])]
1306                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1307                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308                         url_data = [parse_qs(uds) for uds in url_data_strs]
1309                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1311
1312                         format_limit = self._downloader.params.get('format_limit', None)
1313                         if format_limit is not None and format_limit in self._available_formats:
1314                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1315                         else:
1316                                 format_list = self._available_formats
1317                         existing_formats = [x for x in format_list if x in url_map]
1318                         if len(existing_formats) == 0:
1319                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1320                                 return
1321                         if req_format is None:
1322                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323                         elif req_format == '-1':
1324                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1325                         else:
1326                                 # Specific format
1327                                 if req_format not in url_map:
1328                                         self._downloader.trouble(u'ERROR: requested format not available')
1329                                         return
1330                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1331                 else:
1332                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1333                         return
1334
1335                 for format_param, video_real_url in video_url_list:
1336                         # At this point we have a new video
1337                         self._downloader.increment_downloads()
1338
1339                         # Extension
1340                         video_extension = self._video_extensions.get(format_param, 'flv')
1341
1342                         try:
1343                                 # Process video information
1344                                 self._downloader.process_info({
1345                                         'id':           video_id.decode('utf-8'),
1346                                         'url':          video_real_url.decode('utf-8'),
1347                                         'uploader':     video_uploader.decode('utf-8'),
1348                                         'upload_date':  upload_date,
1349                                         'title':        video_title,
1350                                         'stitle':       simple_title,
1351                                         'ext':          video_extension.decode('utf-8'),
1352                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1353                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1354                                         'description':  video_description,
1355                                         'player_url':   player_url,
1356                                 })
1357                         except UnavailableVideoError, err:
1358                                 self._downloader.trouble(u'\nERROR: unable to download video')
1359
1360
1361 class MetacafeIE(InfoExtractor):
1362         """Information Extractor for metacafe.com."""
1363
1364         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1365         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1366         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1367         _youtube_ie = None
1368
1369         def __init__(self, youtube_ie, downloader=None):
1370                 InfoExtractor.__init__(self, downloader)
1371                 self._youtube_ie = youtube_ie
1372
1373         @staticmethod
1374         def suitable(url):
1375                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1376
1377         def report_disclaimer(self):
1378                 """Report disclaimer retrieval."""
1379                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1380
1381         def report_age_confirmation(self):
1382                 """Report attempt to confirm age."""
1383                 self._downloader.to_screen(u'[metacafe] Confirming age')
1384
1385         def report_download_webpage(self, video_id):
1386                 """Report webpage download."""
1387                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1388
1389         def report_extraction(self, video_id):
1390                 """Report information extraction."""
1391                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1392
1393         def _real_initialize(self):
1394                 # Retrieve disclaimer
1395                 request = urllib2.Request(self._DISCLAIMER)
1396                 try:
1397                         self.report_disclaimer()
1398                         disclaimer = urllib2.urlopen(request).read()
1399                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1401                         return
1402
1403                 # Confirm age
1404                 disclaimer_form = {
1405                         'filters': '0',
1406                         'submit': "Continue - I'm over 18",
1407                         }
1408                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1409                 try:
1410                         self.report_age_confirmation()
1411                         disclaimer = urllib2.urlopen(request).read()
1412                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1413                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1414                         return
1415
1416         def _real_extract(self, url):
1417                 # Extract id and simplified title from URL
1418                 mobj = re.match(self._VALID_URL, url)
1419                 if mobj is None:
1420                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1421                         return
1422
1423                 video_id = mobj.group(1)
1424
1425                 # Check if video comes from YouTube
1426                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1427                 if mobj2 is not None:
1428                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1429                         return
1430
1431                 # At this point we have a new video
1432                 self._downloader.increment_downloads()
1433
1434                 simple_title = mobj.group(2).decode('utf-8')
1435
1436                 # Retrieve video webpage to extract further information
1437                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1438                 try:
1439                         self.report_download_webpage(video_id)
1440                         webpage = urllib2.urlopen(request).read()
1441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1443                         return
1444
1445                 # Extract URL, uploader and title from webpage
1446                 self.report_extraction(video_id)
1447                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1448                 if mobj is not None:
1449                         mediaURL = urllib.unquote(mobj.group(1))
1450                         video_extension = mediaURL[-3:]
1451
1452                         # Extract gdaKey if available
1453                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1454                         if mobj is None:
1455                                 video_url = mediaURL
1456                         else:
1457                                 gdaKey = mobj.group(1)
1458                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1459                 else:
1460                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1461                         if mobj is None:
1462                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1463                                 return
1464                         vardict = parse_qs(mobj.group(1))
1465                         if 'mediaData' not in vardict:
1466                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1467                                 return
1468                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1469                         if mobj is None:
1470                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1471                                 return
1472                         mediaURL = mobj.group(1).replace('\\/', '/')
1473                         video_extension = mediaURL[-3:]
1474                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1475
1476                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1477                 if mobj is None:
1478                         self._downloader.trouble(u'ERROR: unable to extract title')
1479                         return
1480                 video_title = mobj.group(1).decode('utf-8')
1481                 video_title = sanitize_title(video_title)
1482
1483                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1484                 if mobj is None:
1485                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1486                         return
1487                 video_uploader = mobj.group(1)
1488
1489                 try:
1490                         # Process video information
1491                         self._downloader.process_info({
1492                                 'id':           video_id.decode('utf-8'),
1493                                 'url':          video_url.decode('utf-8'),
1494                                 'uploader':     video_uploader.decode('utf-8'),
1495                                 'upload_date':  u'NA',
1496                                 'title':        video_title,
1497                                 'stitle':       simple_title,
1498                                 'ext':          video_extension.decode('utf-8'),
1499                                 'format':       u'NA',
1500                                 'player_url':   None,
1501                         })
1502                 except UnavailableVideoError:
1503                         self._downloader.trouble(u'\nERROR: unable to download video')
1504
1505
1506 class DailymotionIE(InfoExtractor):
1507         """Information Extractor for Dailymotion"""
1508
1509         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1510
1511         def __init__(self, downloader=None):
1512                 InfoExtractor.__init__(self, downloader)
1513
1514         @staticmethod
1515         def suitable(url):
1516                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1517
1518         def report_download_webpage(self, video_id):
1519                 """Report webpage download."""
1520                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1521
1522         def report_extraction(self, video_id):
1523                 """Report information extraction."""
1524                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1525
1526         def _real_initialize(self):
1527                 return
1528
1529         def _real_extract(self, url):
1530                 # Extract id and simplified title from URL
1531                 mobj = re.match(self._VALID_URL, url)
1532                 if mobj is None:
1533                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1534                         return
1535
1536                 # At this point we have a new video
1537                 self._downloader.increment_downloads()
1538                 video_id = mobj.group(1)
1539
1540                 simple_title = mobj.group(2).decode('utf-8')
1541                 video_extension = 'flv'
1542
1543                 # Retrieve video webpage to extract further information
1544                 request = urllib2.Request(url)
1545                 request.add_header('Cookie', 'family_filter=off')
1546                 try:
1547                         self.report_download_webpage(video_id)
1548                         webpage = urllib2.urlopen(request).read()
1549                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1550                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1551                         return
1552
1553                 # Extract URL, uploader and title from webpage
1554                 self.report_extraction(video_id)
1555                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1556                 if mobj is None:
1557                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1558                         return
1559                 sequence = urllib.unquote(mobj.group(1))
1560                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1561                 if mobj is None:
1562                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1563                         return
1564                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1565
1566                 # if needed add http://www.dailymotion.com/ if relative URL
1567
1568                 video_url = mediaURL
1569
1570                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1571                 if mobj is None:
1572                         self._downloader.trouble(u'ERROR: unable to extract title')
1573                         return
1574                 video_title = mobj.group(1).decode('utf-8')
1575                 video_title = sanitize_title(video_title)
1576
1577                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1578                 if mobj is None:
1579                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1580                         return
1581                 video_uploader = mobj.group(1)
1582
1583                 try:
1584                         # Process video information
1585                         self._downloader.process_info({
1586                                 'id':           video_id.decode('utf-8'),
1587                                 'url':          video_url.decode('utf-8'),
1588                                 'uploader':     video_uploader.decode('utf-8'),
1589                                 'upload_date':  u'NA',
1590                                 'title':        video_title,
1591                                 'stitle':       simple_title,
1592                                 'ext':          video_extension.decode('utf-8'),
1593                                 'format':       u'NA',
1594                                 'player_url':   None,
1595                         })
1596                 except UnavailableVideoError:
1597                         self._downloader.trouble(u'\nERROR: unable to download video')
1598
1599
1600 class GoogleIE(InfoExtractor):
1601         """Information extractor for video.google.com."""
1602
1603         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1604
1605         def __init__(self, downloader=None):
1606                 InfoExtractor.__init__(self, downloader)
1607
1608         @staticmethod
1609         def suitable(url):
1610                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1611
1612         def report_download_webpage(self, video_id):
1613                 """Report webpage download."""
1614                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1615
1616         def report_extraction(self, video_id):
1617                 """Report information extraction."""
1618                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1619
1620         def _real_initialize(self):
1621                 return
1622
1623         def _real_extract(self, url):
1624                 # Extract id from URL
1625                 mobj = re.match(self._VALID_URL, url)
1626                 if mobj is None:
1627                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1628                         return
1629
1630                 # At this point we have a new video
1631                 self._downloader.increment_downloads()
1632                 video_id = mobj.group(1)
1633
1634                 video_extension = 'mp4'
1635
1636                 # Retrieve video webpage to extract further information
1637                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1638                 try:
1639                         self.report_download_webpage(video_id)
1640                         webpage = urllib2.urlopen(request).read()
1641                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1643                         return
1644
1645                 # Extract URL, uploader, and title from webpage
1646                 self.report_extraction(video_id)
1647                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1648                 if mobj is None:
1649                         video_extension = 'flv'
1650                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1651                 if mobj is None:
1652                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1653                         return
1654                 mediaURL = urllib.unquote(mobj.group(1))
1655                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1656                 mediaURL = mediaURL.replace('\\x26', '\x26')
1657
1658                 video_url = mediaURL
1659
1660                 mobj = re.search(r'<title>(.*)</title>', webpage)
1661                 if mobj is None:
1662                         self._downloader.trouble(u'ERROR: unable to extract title')
1663                         return
1664                 video_title = mobj.group(1).decode('utf-8')
1665                 video_title = sanitize_title(video_title)
1666                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1667
1668                 # Extract video description
1669                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: unable to extract video description')
1672                         return
1673                 video_description = mobj.group(1).decode('utf-8')
1674                 if not video_description:
1675                         video_description = 'No description available.'
1676
1677                 # Extract video thumbnail
1678                 if self._downloader.params.get('forcethumbnail', False):
1679                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1680                         try:
1681                                 webpage = urllib2.urlopen(request).read()
1682                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1684                                 return
1685                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1686                         if mobj is None:
1687                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1688                                 return
1689                         video_thumbnail = mobj.group(1)
1690                 else:   # we need something to pass to process_info
1691                         video_thumbnail = ''
1692
1693                 try:
1694                         # Process video information
1695                         self._downloader.process_info({
1696                                 'id':           video_id.decode('utf-8'),
1697                                 'url':          video_url.decode('utf-8'),
1698                                 'uploader':     u'NA',
1699                                 'upload_date':  u'NA',
1700                                 'title':        video_title,
1701                                 'stitle':       simple_title,
1702                                 'ext':          video_extension.decode('utf-8'),
1703                                 'format':       u'NA',
1704                                 'player_url':   None,
1705                         })
1706                 except UnavailableVideoError:
1707                         self._downloader.trouble(u'\nERROR: unable to download video')
1708
1709
1710 class PhotobucketIE(InfoExtractor):
1711         """Information extractor for photobucket.com."""
1712
1713         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1714
1715         def __init__(self, downloader=None):
1716                 InfoExtractor.__init__(self, downloader)
1717
1718         @staticmethod
1719         def suitable(url):
1720                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1721
1722         def report_download_webpage(self, video_id):
1723                 """Report webpage download."""
1724                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1725
1726         def report_extraction(self, video_id):
1727                 """Report information extraction."""
1728                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1729
1730         def _real_initialize(self):
1731                 return
1732
1733         def _real_extract(self, url):
1734                 # Extract id from URL
1735                 mobj = re.match(self._VALID_URL, url)
1736                 if mobj is None:
1737                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1738                         return
1739
1740                 # At this point we have a new video
1741                 self._downloader.increment_downloads()
1742                 video_id = mobj.group(1)
1743
1744                 video_extension = 'flv'
1745
1746                 # Retrieve video webpage to extract further information
1747                 request = urllib2.Request(url)
1748                 try:
1749                         self.report_download_webpage(video_id)
1750                         webpage = urllib2.urlopen(request).read()
1751                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1753                         return
1754
1755                 # Extract URL, uploader, and title from webpage
1756                 self.report_extraction(video_id)
1757                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1758                 if mobj is None:
1759                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1760                         return
1761                 mediaURL = urllib.unquote(mobj.group(1))
1762
1763                 video_url = mediaURL
1764
1765                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1766                 if mobj is None:
1767                         self._downloader.trouble(u'ERROR: unable to extract title')
1768                         return
1769                 video_title = mobj.group(1).decode('utf-8')
1770                 video_title = sanitize_title(video_title)
1771                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1772
1773                 video_uploader = mobj.group(2).decode('utf-8')
1774
1775                 try:
1776                         # Process video information
1777                         self._downloader.process_info({
1778                                 'id':           video_id.decode('utf-8'),
1779                                 'url':          video_url.decode('utf-8'),
1780                                 'uploader':     video_uploader,
1781                                 'upload_date':  u'NA',
1782                                 'title':        video_title,
1783                                 'stitle':       simple_title,
1784                                 'ext':          video_extension.decode('utf-8'),
1785                                 'format':       u'NA',
1786                                 'player_url':   None,
1787                         })
1788                 except UnavailableVideoError:
1789                         self._downloader.trouble(u'\nERROR: unable to download video')
1790
1791
1792 class YahooIE(InfoExtractor):
1793         """Information extractor for video.yahoo.com."""
1794
1795         # _VALID_URL matches all Yahoo! Video URLs
1796         # _VPAGE_URL matches only the extractable '/watch/' URLs
1797         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1798         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1799
	def __init__(self, downloader=None):
		# Plain delegation; YahooIE keeps no extractor-specific state.
		InfoExtractor.__init__(self, downloader)
1802
1803         @staticmethod
1804         def suitable(url):
1805                 return (re.match(YahooIE._VALID_URL, url) is not None)
1806
1807         def report_download_webpage(self, video_id):
1808                 """Report webpage download."""
1809                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1810
1811         def report_extraction(self, video_id):
1812                 """Report information extraction."""
1813                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1814
1815         def _real_initialize(self):
1816                 return
1817
1818         def _real_extract(self, url, new_video=True):
1819                 # Extract ID from URL
1820                 mobj = re.match(self._VALID_URL, url)
1821                 if mobj is None:
1822                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1823                         return
1824
1825                 # At this point we have a new video
1826                 self._downloader.increment_downloads()
1827                 video_id = mobj.group(2)
1828                 video_extension = 'flv'
1829
1830                 # Rewrite valid but non-extractable URLs as
1831                 # extractable English language /watch/ URLs
1832                 if re.match(self._VPAGE_URL, url) is None:
1833                         request = urllib2.Request(url)
1834                         try:
1835                                 webpage = urllib2.urlopen(request).read()
1836                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1837                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1838                                 return
1839
1840                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1841                         if mobj is None:
1842                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1843                                 return
1844                         yahoo_id = mobj.group(1)
1845
1846                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1847                         if mobj is None:
1848                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1849                                 return
1850                         yahoo_vid = mobj.group(1)
1851
1852                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1853                         return self._real_extract(url, new_video=False)
1854
1855                 # Retrieve video webpage to extract further information
1856                 request = urllib2.Request(url)
1857                 try:
1858                         self.report_download_webpage(video_id)
1859                         webpage = urllib2.urlopen(request).read()
1860                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1861                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1862                         return
1863
1864                 # Extract uploader and title from webpage
1865                 self.report_extraction(video_id)
1866                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1867                 if mobj is None:
1868                         self._downloader.trouble(u'ERROR: unable to extract video title')
1869                         return
1870                 video_title = mobj.group(1).decode('utf-8')
1871                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1872
1873                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1874                 if mobj is None:
1875                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1876                         return
1877                 video_uploader = mobj.group(1).decode('utf-8')
1878
1879                 # Extract video thumbnail
1880                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1881                 if mobj is None:
1882                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1883                         return
1884                 video_thumbnail = mobj.group(1).decode('utf-8')
1885
1886                 # Extract video description
1887                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1888                 if mobj is None:
1889                         self._downloader.trouble(u'ERROR: unable to extract video description')
1890                         return
1891                 video_description = mobj.group(1).decode('utf-8')
1892                 if not video_description:
1893                         video_description = 'No description available.'
1894
1895                 # Extract video height and width
1896                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1897                 if mobj is None:
1898                         self._downloader.trouble(u'ERROR: unable to extract video height')
1899                         return
1900                 yv_video_height = mobj.group(1)
1901
1902                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1903                 if mobj is None:
1904                         self._downloader.trouble(u'ERROR: unable to extract video width')
1905                         return
1906                 yv_video_width = mobj.group(1)
1907
1908                 # Retrieve video playlist to extract media URL
1909                 # I'm not completely sure what all these options are, but we
1910                 # seem to need most of them, otherwise the server sends a 401.
1911                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1912                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1913                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1914                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1915                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1916                 try:
1917                         self.report_download_webpage(video_id)
1918                         webpage = urllib2.urlopen(request).read()
1919                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921                         return
1922
1923                 # Extract media URL from playlist XML
1924                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1925                 if mobj is None:
1926                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1927                         return
1928                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1929                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1930
1931                 try:
1932                         # Process video information
1933                         self._downloader.process_info({
1934                                 'id':           video_id.decode('utf-8'),
1935                                 'url':          video_url,
1936                                 'uploader':     video_uploader,
1937                                 'upload_date':  u'NA',
1938                                 'title':        video_title,
1939                                 'stitle':       simple_title,
1940                                 'ext':          video_extension.decode('utf-8'),
1941                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1942                                 'description':  video_description,
1943                                 'thumbnail':    video_thumbnail,
1944                                 'player_url':   None,
1945                         })
1946                 except UnavailableVideoError:
1947                         self._downloader.trouble(u'\nERROR: unable to download video')
1948
1949
1950 class VimeoIE(InfoExtractor):
1951         """Information extractor for vimeo.com."""
1952
1953         # _VALID_URL matches Vimeo URLs
1954         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1955
1956         def __init__(self, downloader=None):
1957                 InfoExtractor.__init__(self, downloader)
1958
1959         @staticmethod
1960         def suitable(url):
1961                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1962
1963         def report_download_webpage(self, video_id):
1964                 """Report webpage download."""
1965                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1966
1967         def report_extraction(self, video_id):
1968                 """Report information extraction."""
1969                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1970
1971         def _real_initialize(self):
1972                 return
1973
1974         def _real_extract(self, url, new_video=True):
1975                 # Extract ID from URL
1976                 mobj = re.match(self._VALID_URL, url)
1977                 if mobj is None:
1978                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1979                         return
1980
1981                 # At this point we have a new video
1982                 self._downloader.increment_downloads()
1983                 video_id = mobj.group(1)
1984
1985                 # Retrieve video webpage to extract further information
1986                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1987                 try:
1988                         self.report_download_webpage(video_id)
1989                         webpage = urllib2.urlopen(request).read()
1990                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1991                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1992                         return
1993
1994                 # Now we begin extracting as much information as we can from what we
1995                 # retrieved. First we extract the information common to all extractors,
1996                 # and latter we extract those that are Vimeo specific.
1997                 self.report_extraction(video_id)
1998
1999                 # Extract title
2000                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2001                 if mobj is None:
2002                         self._downloader.trouble(u'ERROR: unable to extract video title')
2003                         return
2004                 video_title = mobj.group(1).decode('utf-8')
2005                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2006
2007                 # Extract uploader
2008                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2009                 if mobj is None:
2010                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2011                         return
2012                 video_uploader = mobj.group(1).decode('utf-8')
2013
2014                 # Extract video thumbnail
2015                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2016                 if mobj is None:
2017                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2018                         return
2019                 video_thumbnail = mobj.group(1).decode('utf-8')
2020
2021                 # # Extract video description
2022                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2023                 # if mobj is None:
2024                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2025                 #       return
2026                 # video_description = mobj.group(1).decode('utf-8')
2027                 # if not video_description: video_description = 'No description available.'
2028                 video_description = 'Foo.'
2029
2030                 # Vimeo specific: extract request signature
2031                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2032                 if mobj is None:
2033                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2034                         return
2035                 sig = mobj.group(1).decode('utf-8')
2036
2037                 # Vimeo specific: Extract request signature expiration
2038                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2039                 if mobj is None:
2040                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2041                         return
2042                 sig_exp = mobj.group(1).decode('utf-8')
2043
2044                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2045
2046                 try:
2047                         # Process video information
2048                         self._downloader.process_info({
2049                                 'id':           video_id.decode('utf-8'),
2050                                 'url':          video_url,
2051                                 'uploader':     video_uploader,
2052                                 'upload_date':  u'NA',
2053                                 'title':        video_title,
2054                                 'stitle':       simple_title,
2055                                 'ext':          u'mp4',
2056                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2057                                 'description':  video_description,
2058                                 'thumbnail':    video_thumbnail,
2059                                 'description':  video_description,
2060                                 'player_url':   None,
2061                         })
2062                 except UnavailableVideoError:
2063                         self._downloader.trouble(u'ERROR: unable to download video')
2064
2065
2066 class GenericIE(InfoExtractor):
2067         """Generic last-resort information extractor."""
2068
2069         def __init__(self, downloader=None):
2070                 InfoExtractor.__init__(self, downloader)
2071
2072         @staticmethod
2073         def suitable(url):
2074                 return True
2075
2076         def report_download_webpage(self, video_id):
2077                 """Report webpage download."""
2078                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2079                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2080
2081         def report_extraction(self, video_id):
2082                 """Report information extraction."""
2083                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2084
2085         def _real_initialize(self):
2086                 return
2087
2088         def _real_extract(self, url):
2089                 # At this point we have a new video
2090                 self._downloader.increment_downloads()
2091
2092                 video_id = url.split('/')[-1]
2093                 request = urllib2.Request(url)
2094                 try:
2095                         self.report_download_webpage(video_id)
2096                         webpage = urllib2.urlopen(request).read()
2097                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2098                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2099                         return
2100                 except ValueError, err:
2101                         # since this is the last-resort InfoExtractor, if
2102                         # this error is thrown, it'll be thrown here
2103                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2104                         return
2105
2106                 self.report_extraction(video_id)
2107                 # Start with something easy: JW Player in SWFObject
2108                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2109                 if mobj is None:
2110                         # Broaden the search a little bit
2111                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2112                 if mobj is None:
2113                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2114                         return
2115
2116                 # It's possible that one of the regexes
2117                 # matched, but returned an empty group:
2118                 if mobj.group(1) is None:
2119                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2120                         return
2121
2122                 video_url = urllib.unquote(mobj.group(1))
2123                 video_id = os.path.basename(video_url)
2124
2125                 # here's a fun little line of code for you:
2126                 video_extension = os.path.splitext(video_id)[1][1:]
2127                 video_id = os.path.splitext(video_id)[0]
2128
2129                 # it's tempting to parse this further, but you would
2130                 # have to take into account all the variations like
2131                 #   Video Title - Site Name
2132                 #   Site Name | Video Title
2133                 #   Video Title - Tagline | Site Name
2134                 # and so on and so forth; it's just not practical
2135                 mobj = re.search(r'<title>(.*)</title>', webpage)
2136                 if mobj is None:
2137                         self._downloader.trouble(u'ERROR: unable to extract title')
2138                         return
2139                 video_title = mobj.group(1).decode('utf-8')
2140                 video_title = sanitize_title(video_title)
2141                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2142
2143                 # video uploader is domain name
2144                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2145                 if mobj is None:
2146                         self._downloader.trouble(u'ERROR: unable to extract title')
2147                         return
2148                 video_uploader = mobj.group(1).decode('utf-8')
2149
2150                 try:
2151                         # Process video information
2152                         self._downloader.process_info({
2153                                 'id':           video_id.decode('utf-8'),
2154                                 'url':          video_url.decode('utf-8'),
2155                                 'uploader':     video_uploader,
2156                                 'upload_date':  u'NA',
2157                                 'title':        video_title,
2158                                 'stitle':       simple_title,
2159                                 'ext':          video_extension.decode('utf-8'),
2160                                 'format':       u'NA',
2161                                 'player_url':   None,
2162                         })
2163                 except UnavailableVideoError, err:
2164                         self._downloader.trouble(u'\nERROR: unable to download video')
2165
2166
2167 class YoutubeSearchIE(InfoExtractor):
2168         """Information Extractor for YouTube search queries."""
2169         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2170         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2171         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2172         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2173         _youtube_ie = None
2174         _max_youtube_results = 1000
2175
2176         def __init__(self, youtube_ie, downloader=None):
2177                 InfoExtractor.__init__(self, downloader)
2178                 self._youtube_ie = youtube_ie
2179
2180         @staticmethod
2181         def suitable(url):
2182                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2183
2184         def report_download_page(self, query, pagenum):
2185                 """Report attempt to download playlist page with given number."""
2186                 query = query.decode(preferredencoding())
2187                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2188
2189         def _real_initialize(self):
2190                 self._youtube_ie.initialize()
2191
2192         def _real_extract(self, query):
2193                 mobj = re.match(self._VALID_QUERY, query)
2194                 if mobj is None:
2195                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2196                         return
2197
2198                 prefix, query = query.split(':')
2199                 prefix = prefix[8:]
2200                 query = query.encode('utf-8')
2201                 if prefix == '':
2202                         self._download_n_results(query, 1)
2203                         return
2204                 elif prefix == 'all':
2205                         self._download_n_results(query, self._max_youtube_results)
2206                         return
2207                 else:
2208                         try:
2209                                 n = long(prefix)
2210                                 if n <= 0:
2211                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2212                                         return
2213                                 elif n > self._max_youtube_results:
2214                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2215                                         n = self._max_youtube_results
2216                                 self._download_n_results(query, n)
2217                                 return
2218                         except ValueError: # parsing prefix as integer fails
2219                                 self._download_n_results(query, 1)
2220                                 return
2221
2222         def _download_n_results(self, query, n):
2223                 """Downloads a specified number of results for a query"""
2224
2225                 video_ids = []
2226                 already_seen = set()
2227                 pagenum = 1
2228
2229                 while True:
2230                         self.report_download_page(query, pagenum)
2231                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2232                         request = urllib2.Request(result_url)
2233                         try:
2234                                 page = urllib2.urlopen(request).read()
2235                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2236                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2237                                 return
2238
2239                         # Extract video identifiers
2240                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2241                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2242                                 if video_id not in already_seen:
2243                                         video_ids.append(video_id)
2244                                         already_seen.add(video_id)
2245                                         if len(video_ids) == n:
2246                                                 # Specified n videos reached
2247                                                 for id in video_ids:
2248                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2249                                                 return
2250
2251                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2252                                 for id in video_ids:
2253                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2254                                 return
2255
2256                         pagenum = pagenum + 1
2257
2258
2259 class GoogleSearchIE(InfoExtractor):
2260         """Information Extractor for Google Video search queries."""
2261         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2262         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2263         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2264         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2265         _google_ie = None
2266         _max_google_results = 1000
2267
2268         def __init__(self, google_ie, downloader=None):
2269                 InfoExtractor.__init__(self, downloader)
2270                 self._google_ie = google_ie
2271
2272         @staticmethod
2273         def suitable(url):
2274                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2275
2276         def report_download_page(self, query, pagenum):
2277                 """Report attempt to download playlist page with given number."""
2278                 query = query.decode(preferredencoding())
2279                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2280
2281         def _real_initialize(self):
2282                 self._google_ie.initialize()
2283
2284         def _real_extract(self, query):
2285                 mobj = re.match(self._VALID_QUERY, query)
2286                 if mobj is None:
2287                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2288                         return
2289
2290                 prefix, query = query.split(':')
2291                 prefix = prefix[8:]
2292                 query = query.encode('utf-8')
2293                 if prefix == '':
2294                         self._download_n_results(query, 1)
2295                         return
2296                 elif prefix == 'all':
2297                         self._download_n_results(query, self._max_google_results)
2298                         return
2299                 else:
2300                         try:
2301                                 n = long(prefix)
2302                                 if n <= 0:
2303                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2304                                         return
2305                                 elif n > self._max_google_results:
2306                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2307                                         n = self._max_google_results
2308                                 self._download_n_results(query, n)
2309                                 return
2310                         except ValueError: # parsing prefix as integer fails
2311                                 self._download_n_results(query, 1)
2312                                 return
2313
2314         def _download_n_results(self, query, n):
2315                 """Downloads a specified number of results for a query"""
2316
2317                 video_ids = []
2318                 already_seen = set()
2319                 pagenum = 1
2320
2321                 while True:
2322                         self.report_download_page(query, pagenum)
2323                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2324                         request = urllib2.Request(result_url)
2325                         try:
2326                                 page = urllib2.urlopen(request).read()
2327                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2328                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2329                                 return
2330
2331                         # Extract video identifiers
2332                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2333                                 video_id = mobj.group(1)
2334                                 if video_id not in already_seen:
2335                                         video_ids.append(video_id)
2336                                         already_seen.add(video_id)
2337                                         if len(video_ids) == n:
2338                                                 # Specified n videos reached
2339                                                 for id in video_ids:
2340                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2341                                                 return
2342
2343                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2344                                 for id in video_ids:
2345                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2346                                 return
2347
2348                         pagenum = pagenum + 1
2349
2350
2351 class YahooSearchIE(InfoExtractor):
2352         """Information Extractor for Yahoo! Video search queries."""
2353         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2354         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2355         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2356         _MORE_PAGES_INDICATOR = r'\s*Next'
2357         _yahoo_ie = None
2358         _max_yahoo_results = 1000
2359
	def __init__(self, yahoo_ie, downloader=None):
		"""Keep a reference to the YahooIE used to extract each result."""
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie
2363
2364         @staticmethod
2365         def suitable(url):
2366                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2367
2368         def report_download_page(self, query, pagenum):
2369                 """Report attempt to download playlist page with given number."""
2370                 query = query.decode(preferredencoding())
2371                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2372
	def _real_initialize(self):
		# Initialization just delegates to the wrapped YahooIE.
		self._yahoo_ie.initialize()
2375
2376         def _real_extract(self, query):
2377                 mobj = re.match(self._VALID_QUERY, query)
2378                 if mobj is None:
2379                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2380                         return
2381
2382                 prefix, query = query.split(':')
2383                 prefix = prefix[8:]
2384                 query = query.encode('utf-8')
2385                 if prefix == '':
2386                         self._download_n_results(query, 1)
2387                         return
2388                 elif prefix == 'all':
2389                         self._download_n_results(query, self._max_yahoo_results)
2390                         return
2391                 else:
2392                         try:
2393                                 n = long(prefix)
2394                                 if n <= 0:
2395                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2396                                         return
2397                                 elif n > self._max_yahoo_results:
2398                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2399                                         n = self._max_yahoo_results
2400                                 self._download_n_results(query, n)
2401                                 return
2402                         except ValueError: # parsing prefix as integer fails
2403                                 self._download_n_results(query, 1)
2404                                 return
2405
2406         def _download_n_results(self, query, n):
2407                 """Downloads a specified number of results for a query"""
2408
2409                 video_ids = []
2410                 already_seen = set()
2411                 pagenum = 1
2412
2413                 while True:
2414                         self.report_download_page(query, pagenum)
2415                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2416                         request = urllib2.Request(result_url)
2417                         try:
2418                                 page = urllib2.urlopen(request).read()
2419                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2420                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2421                                 return
2422
2423                         # Extract video identifiers
2424                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2425                                 video_id = mobj.group(1)
2426                                 if video_id not in already_seen:
2427                                         video_ids.append(video_id)
2428                                         already_seen.add(video_id)
2429                                         if len(video_ids) == n:
2430                                                 # Specified n videos reached
2431                                                 for id in video_ids:
2432                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2433                                                 return
2434
2435                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2436                                 for id in video_ids:
2437                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2438                                 return
2439
2440                         pagenum = pagenum + 1
2441
2442
2443 class YoutubePlaylistIE(InfoExtractor):
2444         """Information Extractor for YouTube playlists."""
2445
2446         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2447         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2448         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2449         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2450         _youtube_ie = None
2451
2452         def __init__(self, youtube_ie, downloader=None):
2453                 InfoExtractor.__init__(self, downloader)
2454                 self._youtube_ie = youtube_ie
2455
2456         @staticmethod
2457         def suitable(url):
2458                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2459
2460         def report_download_page(self, playlist_id, pagenum):
2461                 """Report attempt to download playlist page with given number."""
2462                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2463
2464         def _real_initialize(self):
2465                 self._youtube_ie.initialize()
2466
2467         def _real_extract(self, url):
2468                 # Extract playlist id
2469                 mobj = re.match(self._VALID_URL, url)
2470                 if mobj is None:
2471                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2472                         return
2473
2474                 # Single video case
2475                 if mobj.group(3) is not None:
2476                         self._youtube_ie.extract(mobj.group(3))
2477                         return
2478
2479                 # Download playlist pages
2480                 # prefix is 'p' as default for playlists but there are other types that need extra care
2481                 playlist_prefix = mobj.group(1)
2482                 if playlist_prefix == 'a':
2483                         playlist_access = 'artist'
2484                 else:
2485                         playlist_prefix = 'p'
2486                         playlist_access = 'view_play_list'
2487                 playlist_id = mobj.group(2)
2488                 video_ids = []
2489                 pagenum = 1
2490
2491                 while True:
2492                         self.report_download_page(playlist_id, pagenum)
2493                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2494                         try:
2495                                 page = urllib2.urlopen(request).read()
2496                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2497                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2498                                 return
2499
2500                         # Extract video identifiers
2501                         ids_in_page = []
2502                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2503                                 if mobj.group(1) not in ids_in_page:
2504                                         ids_in_page.append(mobj.group(1))
2505                         video_ids.extend(ids_in_page)
2506
2507                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2508                                 break
2509                         pagenum = pagenum + 1
2510
2511                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2512                 playlistend = self._downloader.params.get('playlistend', -1)
2513                 video_ids = video_ids[playliststart:playlistend]
2514
2515                 for id in video_ids:
2516                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2517                 return
2518
2519
2520 class YoutubeUserIE(InfoExtractor):
2521         """Information Extractor for YouTube users."""
2522
2523         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2524         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2525         _GDATA_PAGE_SIZE = 50
2526         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2527         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2528         _youtube_ie = None
2529
2530         def __init__(self, youtube_ie, downloader=None):
2531                 InfoExtractor.__init__(self, downloader)
2532                 self._youtube_ie = youtube_ie
2533
2534         @staticmethod
2535         def suitable(url):
2536                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2537
2538         def report_download_page(self, username, start_index):
2539                 """Report attempt to download user page."""
2540                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2541                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2542
2543         def _real_initialize(self):
2544                 self._youtube_ie.initialize()
2545
2546         def _real_extract(self, url):
2547                 # Extract username
2548                 mobj = re.match(self._VALID_URL, url)
2549                 if mobj is None:
2550                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2551                         return
2552
2553                 username = mobj.group(1)
2554
2555                 # Download video ids using YouTube Data API. Result size per
2556                 # query is limited (currently to 50 videos) so we need to query
2557                 # page by page until there are no video ids - it means we got
2558                 # all of them.
2559
2560                 video_ids = []
2561                 pagenum = 0
2562
2563                 while True:
2564                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2565                         self.report_download_page(username, start_index)
2566
2567                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2568
2569                         try:
2570                                 page = urllib2.urlopen(request).read()
2571                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2572                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2573                                 return
2574
2575                         # Extract video identifiers
2576                         ids_in_page = []
2577
2578                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2579                                 if mobj.group(1) not in ids_in_page:
2580                                         ids_in_page.append(mobj.group(1))
2581
2582                         video_ids.extend(ids_in_page)
2583
2584                         # A little optimization - if current page is not
2585                         # "full", ie. does not contain PAGE_SIZE video ids then
2586                         # we can assume that this page is the last one - there
2587                         # are no more ids on further pages - no need to query
2588                         # again.
2589
2590                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2591                                 break
2592
2593                         pagenum += 1
2594
2595                 all_ids_count = len(video_ids)
2596                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2597                 playlistend = self._downloader.params.get('playlistend', -1)
2598
2599                 if playlistend == -1:
2600                         video_ids = video_ids[playliststart:]
2601                 else:
2602                         video_ids = video_ids[playliststart:playlistend]
2603
2604                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2605                                 (username, all_ids_count, len(video_ids)))
2606
2607                 for video_id in video_ids:
2608                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2609
2610
2611 class DepositFilesIE(InfoExtractor):
2612         """Information extractor for depositfiles.com"""
2613
2614         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2615
2616         def __init__(self, downloader=None):
2617                 InfoExtractor.__init__(self, downloader)
2618
2619         @staticmethod
2620         def suitable(url):
2621                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2622
2623         def report_download_webpage(self, file_id):
2624                 """Report webpage download."""
2625                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2626
2627         def report_extraction(self, file_id):
2628                 """Report information extraction."""
2629                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2630
2631         def _real_initialize(self):
2632                 return
2633
2634         def _real_extract(self, url):
2635                 # At this point we have a new file
2636                 self._downloader.increment_downloads()
2637
2638                 file_id = url.split('/')[-1]
2639                 # Rebuild url in english locale
2640                 url = 'http://depositfiles.com/en/files/' + file_id
2641
2642                 # Retrieve file webpage with 'Free download' button pressed
2643                 free_download_indication = { 'gateway_result' : '1' }
2644                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2645                 try:
2646                         self.report_download_webpage(file_id)
2647                         webpage = urllib2.urlopen(request).read()
2648                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2649                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2650                         return
2651
2652                 # Search for the real file URL
2653                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2654                 if (mobj is None) or (mobj.group(1) is None):
2655                         # Try to figure out reason of the error.
2656                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2657                         if (mobj is not None) and (mobj.group(1) is not None):
2658                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2659                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2660                         else:
2661                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2662                         return
2663
2664                 file_url = mobj.group(1)
2665                 file_extension = os.path.splitext(file_url)[1][1:]
2666
2667                 # Search for file title
2668                 mobj = re.search(r'<b title="(.*?)">', webpage)
2669                 if mobj is None:
2670                         self._downloader.trouble(u'ERROR: unable to extract title')
2671                         return
2672                 file_title = mobj.group(1).decode('utf-8')
2673
2674                 try:
2675                         # Process file information
2676                         self._downloader.process_info({
2677                                 'id':           file_id.decode('utf-8'),
2678                                 'url':          file_url.decode('utf-8'),
2679                                 'uploader':     u'NA',
2680                                 'upload_date':  u'NA',
2681                                 'title':        file_title,
2682                                 'stitle':       file_title,
2683                                 'ext':          file_extension.decode('utf-8'),
2684                                 'format':       u'NA',
2685                                 'player_url':   None,
2686                         })
2687                 except UnavailableVideoError, err:
2688                         self._downloader.trouble(u'ERROR: unable to download file')
2689
2690
2691 class FacebookIE(InfoExtractor):
2692         """Information Extractor for Facebook"""
2693
2694         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2695         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2696         _NETRC_MACHINE = 'facebook'
2697         _available_formats = ['highqual', 'lowqual']
2698         _video_extensions = {
2699                 'highqual': 'mp4',
2700                 'lowqual': 'mp4',
2701         }
2702
2703         def __init__(self, downloader=None):
2704                 InfoExtractor.__init__(self, downloader)
2705
2706         @staticmethod
2707         def suitable(url):
2708                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2709
2710         def _reporter(self, message):
2711                 """Add header and report message."""
2712                 self._downloader.to_screen(u'[facebook] %s' % message)
2713
2714         def report_login(self):
2715                 """Report attempt to log in."""
2716                 self._reporter(u'Logging in')
2717
2718         def report_video_webpage_download(self, video_id):
2719                 """Report attempt to download video webpage."""
2720                 self._reporter(u'%s: Downloading video webpage' % video_id)
2721
2722         def report_information_extraction(self, video_id):
2723                 """Report attempt to extract video information."""
2724                 self._reporter(u'%s: Extracting video information' % video_id)
2725
2726         def _parse_page(self, video_webpage):
2727                 """Extract video information from page"""
2728                 # General data
2729                 data = {'title': r'class="video_title datawrap">(.*?)</',
2730                         'description': r'<div class="datawrap">(.*?)</div>',
2731                         'owner': r'\("video_owner_name", "(.*?)"\)',
2732                         'upload_date': r'data-date="(.*?)"',
2733                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2734                         }
2735                 video_info = {}
2736                 for piece in data.keys():
2737                         mobj = re.search(data[piece], video_webpage)
2738                         if mobj is not None:
2739                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2740
2741                 # Video urls
2742                 video_urls = {}
2743                 for fmt in self._available_formats:
2744                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2745                         if mobj is not None:
2746                                 # URL is in a Javascript segment inside an escaped Unicode format within
2747                                 # the generally utf-8 page
2748                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2749                 video_info['video_urls'] = video_urls
2750
2751                 return video_info
2752
2753         def _real_initialize(self):
2754                 if self._downloader is None:
2755                         return
2756
2757                 useremail = None
2758                 password = None
2759                 downloader_params = self._downloader.params
2760
2761                 # Attempt to use provided username and password or .netrc data
2762                 if downloader_params.get('username', None) is not None:
2763                         useremail = downloader_params['username']
2764                         password = downloader_params['password']
2765                 elif downloader_params.get('usenetrc', False):
2766                         try:
2767                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2768                                 if info is not None:
2769                                         useremail = info[0]
2770                                         password = info[2]
2771                                 else:
2772                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2773                         except (IOError, netrc.NetrcParseError), err:
2774                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2775                                 return
2776
2777                 if useremail is None:
2778                         return
2779
2780                 # Log in
2781                 login_form = {
2782                         'email': useremail,
2783                         'pass': password,
2784                         'login': 'Log+In'
2785                         }
2786                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2787                 try:
2788                         self.report_login()
2789                         login_results = urllib2.urlopen(request).read()
2790                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2791                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2792                                 return
2793                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2794                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2795                         return
2796
2797         def _real_extract(self, url):
2798                 mobj = re.match(self._VALID_URL, url)
2799                 if mobj is None:
2800                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2801                         return
2802                 video_id = mobj.group('ID')
2803
2804                 # Get video webpage
2805                 self.report_video_webpage_download(video_id)
2806                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2807                 try:
2808                         page = urllib2.urlopen(request)
2809                         video_webpage = page.read()
2810                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2811                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2812                         return
2813
2814                 # Start extracting information
2815                 self.report_information_extraction(video_id)
2816
2817                 # Extract information
2818                 video_info = self._parse_page(video_webpage)
2819
2820                 # uploader
2821                 if 'owner' not in video_info:
2822                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2823                         return
2824                 video_uploader = video_info['owner']
2825
2826                 # title
2827                 if 'title' not in video_info:
2828                         self._downloader.trouble(u'ERROR: unable to extract video title')
2829                         return
2830                 video_title = video_info['title']
2831                 video_title = video_title.decode('utf-8')
2832                 video_title = sanitize_title(video_title)
2833
2834                 # simplified title
2835                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2836                 simple_title = simple_title.strip(ur'_')
2837
2838                 # thumbnail image
2839                 if 'thumbnail' not in video_info:
2840                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2841                         video_thumbnail = ''
2842                 else:
2843                         video_thumbnail = video_info['thumbnail']
2844
2845                 # upload date
2846                 upload_date = u'NA'
2847                 if 'upload_date' in video_info:
2848                         upload_time = video_info['upload_date']
2849                         timetuple = email.utils.parsedate_tz(upload_time)
2850                         if timetuple is not None:
2851                                 try:
2852                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2853                                 except:
2854                                         pass
2855
2856                 # description
2857                 video_description = video_info.get('description', 'No description available.')
2858
2859                 url_map = video_info['video_urls']
2860                 if len(url_map.keys()) > 0:
2861                         # Decide which formats to download
2862                         req_format = self._downloader.params.get('format', None)
2863                         format_limit = self._downloader.params.get('format_limit', None)
2864
2865                         if format_limit is not None and format_limit in self._available_formats:
2866                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2867                         else:
2868                                 format_list = self._available_formats
2869                         existing_formats = [x for x in format_list if x in url_map]
2870                         if len(existing_formats) == 0:
2871                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2872                                 return
2873                         if req_format is None:
2874                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2875                         elif req_format == '-1':
2876                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2877                         else:
2878                                 # Specific format
2879                                 if req_format not in url_map:
2880                                         self._downloader.trouble(u'ERROR: requested format not available')
2881                                         return
2882                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2883
2884                 for format_param, video_real_url in video_url_list:
2885
2886                         # At this point we have a new video
2887                         self._downloader.increment_downloads()
2888
2889                         # Extension
2890                         video_extension = self._video_extensions.get(format_param, 'mp4')
2891
2892                         try:
2893                                 # Process video information
2894                                 self._downloader.process_info({
2895                                         'id':           video_id.decode('utf-8'),
2896                                         'url':          video_real_url.decode('utf-8'),
2897                                         'uploader':     video_uploader.decode('utf-8'),
2898                                         'upload_date':  upload_date,
2899                                         'title':        video_title,
2900                                         'stitle':       simple_title,
2901                                         'ext':          video_extension.decode('utf-8'),
2902                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2903                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2904                                         'description':  video_description.decode('utf-8'),
2905                                         'player_url':   None,
2906                                 })
2907                         except UnavailableVideoError, err:
2908                                 self._downloader.trouble(u'\nERROR: unable to download video')
2909
2910 class BlipTVIE(InfoExtractor):
2911         """Information extractor for blip.tv"""
2912
2913         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2914         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2915
2916         @staticmethod
2917         def suitable(url):
2918                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2919
2920         def report_extraction(self, file_id):
2921                 """Report information extraction."""
2922                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2923
2924         def _simplify_title(self, title):
2925                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2926                 res = res.strip(ur'_')
2927                 return res
2928
2929         def _real_extract(self, url):
2930                 mobj = re.match(self._VALID_URL, url)
2931                 if mobj is None:
2932                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2933                         return
2934
2935                 if '?' in url:
2936                         cchar = '&'
2937                 else:
2938                         cchar = '?'
2939                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2940                 request = urllib2.Request(json_url)
2941                 self.report_extraction(mobj.group(1))
2942                 try:
2943                         json_code = urllib2.urlopen(request).read()
2944                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2945                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2946                         return
2947                 try:
2948                         json_data = json.loads(json_code)
2949                         if 'Post' in json_data:
2950                                 data = json_data['Post']
2951                         else:
2952                                 data = json_data
2953
2954                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2955                         video_url = data['media']['url']
2956                         umobj = re.match(self._URL_EXT, video_url)
2957                         if umobj is None:
2958                                 raise ValueError('Can not determine filename extension')
2959                         ext = umobj.group(1)
2960
2961                         self._downloader.increment_downloads()
2962
2963                         info = {
2964                                 'id': data['item_id'],
2965                                 'url': video_url,
2966                                 'uploader': data['display_name'],
2967                                 'upload_date': upload_date,
2968                                 'title': data['title'],
2969                                 'stitle': self._simplify_title(data['title']),
2970                                 'ext': ext,
2971                                 'format': data['media']['mimeType'],
2972                                 'thumbnail': data['thumbnailUrl'],
2973                                 'description': data['description'],
2974                                 'player_url': data['embedUrl']
2975                         }
2976                 except (ValueError,KeyError), err:
2977                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2978                         return
2979
2980                 try:
2981                         self._downloader.process_info(info)
2982                 except UnavailableVideoError, err:
2983                         self._downloader.trouble(u'\nERROR: unable to download video')
2984
2985
2986 class MyVideoIE(InfoExtractor):
2987         """Information Extractor for myvideo.de."""
2988
2989         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2990
	def __init__(self, downloader=None):
		# Standard InfoExtractor initialization; no extra state here.
		InfoExtractor.__init__(self, downloader)
2993         
2994         @staticmethod
2995         def suitable(url):
2996                 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2997
2998         def report_download_webpage(self, video_id):
2999                 """Report webpage download."""
3000                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3001
3002         def report_extraction(self, video_id):
3003                 """Report information extraction."""
3004                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3005
	def _real_initialize(self):
		# No login or other setup is required for myvideo.de.
		return
3008
3009         def _real_extract(self,url):
3010                 mobj = re.match(self._VALID_URL, url)
3011                 if mobj is None:
3012                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3013                         return
3014
3015                 video_id = mobj.group(1)
3016                 simple_title = mobj.group(2).decode('utf-8')
3017                 # should actually not be necessary
3018                 simple_title = sanitize_title(simple_title)
3019                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3020
3021                 # Get video webpage
3022                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3023                 try:
3024                         self.report_download_webpage(video_id)
3025                         webpage = urllib2.urlopen(request).read()
3026                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3027                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3028                         return
3029
3030                 self.report_extraction(video_id)
3031                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3032                                  webpage)
3033                 if mobj is None:
3034                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3035                         return
3036                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3037
3038                 mobj = re.search('<title>([^<]+)</title>', webpage)
3039                 if mobj is None:
3040                         self._downloader.trouble(u'ERROR: unable to extract title')
3041                         return
3042
3043                 video_title = mobj.group(1)
3044                 video_title = sanitize_title(video_title)
3045
3046                 try:
3047                         print(video_url)
3048                         self._downloader.process_info({
3049                                 'id':           video_id,
3050                                 'url':          video_url,
3051                                 'uploader':     u'NA',
3052                                 'upload_date':  u'NA',
3053                                 'title':        video_title,
3054                                 'stitle':       simple_title,
3055                                 'ext':          u'flv',
3056                                 'format':       u'NA',
3057                                 'player_url':   None,
3058                         })
3059                 except UnavailableVideoError:
3060                         self._downloader.trouble(u'\nERROR: Unable to download video')
3061
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":tds"/":colbert"-style shortcut or a full-episodes
	# URL; 'episode' may be empty, meaning "download the newest episode".
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(ComedyCentralIE._VALID_URL, url) is not None)

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
	
	def report_config_download(self, episode_id):
		"""Report download of a per-video media configuration file."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's RSS index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse every run of characters outside simple_title_chars
		# into a single underscore and trim leading/trailing ones.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Download all video segments of the selected (or newest) episode."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand ":tds"-style shortcuts into the show's front page URL
		# and re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means: follow the front page's
		# redirect to whatever the newest episode is.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The front page redirected to a concrete episode URL;
			# re-parse it to obtain the real episode title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player's "movie" parameter embeds the mtvnservices
		# URI identifying the episode's media.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# The RSS index contains one <item> per video segment.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:<show>.com:<id>"; split out its parts.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect every available (bitrate, url) rendition of this
			# segment.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# Keep going with the remaining segments of the episode.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3205
3206 class EscapistIE(InfoExtractor):
3207         """Information extractor for The Escapist """
3208
3209         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3210
3211         @staticmethod
3212         def suitable(url):
3213                 return (re.match(EscapistIE._VALID_URL, url) is not None)
3214
3215         def report_extraction(self, showName):
3216                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3217
3218         def report_config_download(self, showName):
3219                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3220
3221         def _simplify_title(self, title):
3222                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3223                 res = res.strip(ur'_')
3224                 return res
3225
3226         def _real_extract(self, url):
3227                 htmlParser = HTMLParser.HTMLParser()
3228
3229                 mobj = re.match(self._VALID_URL, url)
3230                 if mobj is None:
3231                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3232                         return
3233                 showName = mobj.group('showname')
3234                 videoId = mobj.group('episode')
3235
3236                 self.report_extraction(showName)
3237                 try:
3238                         webPage = urllib2.urlopen(url).read()
3239                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3240                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3241                         return
3242
3243                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3244                 description = htmlParser.unescape(descMatch.group(1))
3245                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3246                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3247                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3248                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3249                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3250                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3251
3252                 self.report_config_download(showName)
3253                 try:
3254                         configJSON = urllib2.urlopen(configUrl).read()
3255                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3256                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3257                         return
3258
3259                 # Technically, it's JavaScript, not JSON
3260                 configJSON = configJSON.replace("'", '"')
3261
3262                 try:
3263                         config = json.loads(configJSON)
3264                 except (ValueError,), err:
3265                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3266                         return
3267
3268                 playlist = config['playlist']
3269                 videoUrl = playlist[1]['url']
3270
3271                 self._downloader.increment_downloads()
3272                 info = {
3273                         'id': videoId,
3274                         'url': videoUrl,
3275                         'uploader': showName,
3276                         'upload_date': None,
3277                         'title': showName,
3278                         'stitle': self._simplify_title(showName),
3279                         'ext': 'flv',
3280                         'format': 'flv',
3281                         'thumbnail': imgUrl,
3282                         'description': description,
3283                         'player_url': playerUrl,
3284                 }
3285
3286                 try:
3287                         self._downloader.process_info(info)
3288                 except UnavailableVideoError, err:
3289                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3290
3291
3292
class PostProcessor(object):
	"""Base class for post-processing steps.

	A PostProcessor is attached to a FileDownloader via the downloader's
	add_post_processor() method. After each successful download, the
	downloader walks its chain of processors and calls run() on every
	one, feeding each call the dictionary the previous call returned
	(the first call receives the downloader's initial dictionary).

	A processor that returns None terminates the chain early; otherwise
	the chain runs until its end.

	Like InfoExtractor, this class participates in a "mutual
	registration" handshake with its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the given downloader as the owner of this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		"information" is an InfoExtractor-style dictionary extended with
		a "filepath" entry naming the downloaded file on disk.

		Return a (possibly updated) dictionary to forward to the next
		processor in the chain, or None to stop the chain. A processor
		may also raise PostProcessingError, which the owning downloader
		knows how to handle.
		"""
		return information # the base implementation is a no-op
3339
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that converts a downloaded video into an
	audio-only file using ffmpeg (and ffprobe for codec detection).

	preferredcodec may be 'best' (copy the source audio losslessly when
	it is aac or mp3, otherwise transcode to mp3), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name reported by ffprobe for the file
		at path, or None if it cannot be determined."""
		try:
			# open() instead of the Python-2-only file() builtin.
			dev_null = open(os.path.devnull, 'w')
		except (IOError, OSError):
			return None
		try:
			try:
				cmd = ['ffprobe', '-show_streams', '--', path]
				handle = subprocess.Popen(cmd, stderr=dev_null, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			except (IOError, OSError):
				return None
		finally:
			# Bug fix: the devnull handle used to be leaked.
			dev_null.close()
		# Scan the stream dump: remember the last codec_name seen and
		# report it once we hit the audio stream's codec_type line.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with ffmpeg; True on success."""
		try:
			dev_null = open(os.path.devnull, 'w')
		except (IOError, OSError):
			return False
		try:
			try:
				cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
				ret = subprocess.call(cmd, stdout=dev_null, stderr=subprocess.STDOUT)
				return (ret == 0)
			except (IOError, OSError):
				return False
		finally:
			# Bug fix: the devnull handle used to be leaked.
			dev_null.close()

	def run(self, information):
		"""Extract the audio of information['filepath'], replace the
		video file with the audio file, and return the updated dict
		(None on any failure, which stops the PP chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Remove the original video only after a successful conversion.
		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3422
3423 def updateSelf(downloader, filename):
3424         ''' Update the program file with the latest version from the repository '''
3425         # Note: downloader only used for options
3426         if not os.access(filename, os.W_OK):
3427                 sys.exit('ERROR: no write permissions on %s' % filename)
3428
3429         downloader.to_screen('Updating to latest version...')
3430
3431         try:
3432                 try:
3433                         urlh = urllib.urlopen(UPDATE_URL)
3434                         newcontent = urlh.read()
3435                 finally:
3436                         urlh.close()
3437         except (IOError, OSError), err:
3438                 sys.exit('ERROR: unable to download latest version')
3439
3440         try:
3441                 outf = open(filename, 'wb')
3442                 try:
3443                         outf.write(newcontent)
3444                 finally:
3445                         outf.close()
3446         except (IOError, OSError), err:
3447                 sys.exit('ERROR: unable to overwrite current version')
3448
3449         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3450
def parseOpts():
	"""Build the optparse-based command-line parser and parse sys.argv.

	Returns a (parser, opts, args) tuple."""
	# Deferred import: optparse is only needed when parsing options.
	# (Removed an unused local "import getpass".)
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		"""Best-effort detection of the terminal width; None if unknown."""
		columns = os.environ.get('COLUMNS', None)
		if columns:
			# Robustness fix: a non-numeric COLUMNS used to crash here;
			# fall through to the stty probe instead.
			try:
				return int(columns)
			except ValueError:
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (IOError, OSError, ValueError, IndexError):
			# Narrowed from a bare except:, which also swallowed
			# KeyboardInterrupt and SystemExit.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3620
3621 def main():
3622         parser, opts, args = parseOpts()
3623
3624         # Open appropriate CookieJar
3625         if opts.cookiefile is None:
3626                 jar = cookielib.CookieJar()
3627         else:
3628                 try:
3629                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3630                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3631                                 jar.load()
3632                 except (IOError, OSError), err:
3633                         sys.exit(u'ERROR: unable to open cookie file')
3634
3635         # Dump user agent
3636         if opts.dump_user_agent:
3637                 print std_headers['User-Agent']
3638                 sys.exit(0)
3639
3640         # General configuration
3641         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3642         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3643         urllib2.install_opener(opener)
3644         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3645
3646         # Batch file verification
3647         batchurls = []
3648         if opts.batchfile is not None:
3649                 try:
3650                         if opts.batchfile == '-':
3651                                 batchfd = sys.stdin
3652                         else:
3653                                 batchfd = open(opts.batchfile, 'r')
3654                         batchurls = batchfd.readlines()
3655                         batchurls = [x.strip() for x in batchurls]
3656                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3657                 except IOError:
3658                         sys.exit(u'ERROR: batch file could not be read')
3659         all_urls = batchurls + args
3660
3661         # Conflicting, missing and erroneous options
3662         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3663                 parser.error(u'using .netrc conflicts with giving username/password')
3664         if opts.password is not None and opts.username is None:
3665                 parser.error(u'account username missing')
3666         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3667                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3668         if opts.usetitle and opts.useliteral:
3669                 parser.error(u'using title conflicts with using literal title')
3670         if opts.username is not None and opts.password is None:
3671                 opts.password = getpass.getpass(u'Type account password and press return:')
3672         if opts.ratelimit is not None:
3673                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3674                 if numeric_limit is None:
3675                         parser.error(u'invalid rate limit specified')
3676                 opts.ratelimit = numeric_limit
3677         if opts.retries is not None:
3678                 try:
3679                         opts.retries = long(opts.retries)
3680                 except (TypeError, ValueError), err:
3681                         parser.error(u'invalid retry count specified')
3682         try:
3683                 opts.playliststart = int(opts.playliststart)
3684                 if opts.playliststart <= 0:
3685                         raise ValueError(u'Playlist start must be positive')
3686         except (TypeError, ValueError), err:
3687                 parser.error(u'invalid playlist start number specified')
3688         try:
3689                 opts.playlistend = int(opts.playlistend)
3690                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3691                         raise ValueError(u'Playlist end must be greater than playlist start')
3692         except (TypeError, ValueError), err:
3693                 parser.error(u'invalid playlist end number specified')
3694         if opts.extractaudio:
3695                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3696                         parser.error(u'invalid audio format specified')
3697
3698         # Information extractors
3699         youtube_ie = YoutubeIE()
3700         google_ie = GoogleIE()
3701         yahoo_ie = YahooIE()
3702         extractors = [ # Order does matter
3703                 youtube_ie,
3704                 MetacafeIE(youtube_ie),
3705                 DailymotionIE(),
3706                 YoutubePlaylistIE(youtube_ie),
3707                 YoutubeUserIE(youtube_ie),
3708                 YoutubeSearchIE(youtube_ie),
3709                 google_ie,
3710                 GoogleSearchIE(google_ie),
3711                 PhotobucketIE(),
3712                 yahoo_ie,
3713                 YahooSearchIE(yahoo_ie),
3714                 DepositFilesIE(),
3715                 FacebookIE(),
3716                 BlipTVIE(),
3717                 VimeoIE(),
3718                 MyVideoIE(),
3719                 ComedyCentralIE(),
3720                 EscapistIE(),
3721
3722                 GenericIE()
3723         ]
3724
3725         # File downloader
3726         fd = FileDownloader({
3727                 'usenetrc': opts.usenetrc,
3728                 'username': opts.username,
3729                 'password': opts.password,
3730                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3731                 'forceurl': opts.geturl,
3732                 'forcetitle': opts.gettitle,
3733                 'forcethumbnail': opts.getthumbnail,
3734                 'forcedescription': opts.getdescription,
3735                 'forcefilename': opts.getfilename,
3736                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3737                 'format': opts.format,
3738                 'format_limit': opts.format_limit,
3739                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3740                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3741                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3742                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3743                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3744                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3745                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3746                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3747                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3748                         or u'%(id)s.%(ext)s'),
3749                 'ignoreerrors': opts.ignoreerrors,
3750                 'ratelimit': opts.ratelimit,
3751                 'nooverwrites': opts.nooverwrites,
3752                 'retries': opts.retries,
3753                 'continuedl': opts.continue_dl,
3754                 'noprogress': opts.noprogress,
3755                 'playliststart': opts.playliststart,
3756                 'playlistend': opts.playlistend,
3757                 'logtostderr': opts.outtmpl == '-',
3758                 'consoletitle': opts.consoletitle,
3759                 'nopart': opts.nopart,
3760                 'updatetime': opts.updatetime,
3761                 'writedescription': opts.writedescription,
3762                 'writeinfojson': opts.writeinfojson,
3763                 'matchtitle': opts.matchtitle,
3764                 'rejecttitle': opts.rejecttitle,
3765                 })
3766         for extractor in extractors:
3767                 fd.add_info_extractor(extractor)
3768
3769         # PostProcessors
3770         if opts.extractaudio:
3771                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3772
3773         # Update version
3774         if opts.update_self:
3775                 updateSelf(fd, sys.argv[0])
3776
3777         # Maybe do nothing
3778         if len(all_urls) < 1:
3779                 if not opts.update_self:
3780                         parser.error(u'you must provide at least one URL')
3781                 else:
3782                         sys.exit()
3783         retcode = fd.download(all_urls)
3784
3785         # Dump cookie jar if requested
3786         if opts.cookiefile is not None:
3787                 try:
3788                         jar.save()
3789                 except (IOError, OSError), err:
3790                         sys.exit(u'ERROR: unable to save cookie jar')
3791
3792         sys.exit(retcode)
3793
3794
# Script entry point: run main() and translate the known fatal exceptions
# into process exit statuses / messages.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# Signal failure through a non-zero exit status only; the
		# message content is handled elsewhere.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: leave with an explicit message rather than a traceback.
		sys.exit(u'\nERROR: Interrupted by user')
3804
3805 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: