7ac27b5a049dddc978c13c50c566e5886d85ab0d
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         )
15
16 __license__ = 'Public Domain'
17 __version__ = '2011.08.28-phihag'
18
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
20
21 import cookielib
22 import datetime
23 import gzip
24 import htmlentitydefs
25 import httplib
26 import locale
27 import math
28 import netrc
29 import os
30 import os.path
31 import re
32 import socket
33 import string
34 import subprocess
35 import sys
36 import time
37 import urllib
38 import urllib2
39 import warnings
40 import zlib
41
42 if os.name == 'nt':
43         import ctypes
44
45 try:
46         import email.utils
47 except ImportError: # Python 2.4
48         import email.Utils
49 try:
50         import cStringIO as StringIO
51 except ImportError:
52         import StringIO
53
54 # parse_qs was moved from the cgi module to the urlparse module recently.
55 try:
56         from urlparse import parse_qs
57 except ImportError:
58         from cgi import parse_qs
59
60 try:
61         import lxml.etree
62 except ImportError:
63         pass # Handled below
64
# Standard HTTP headers sent with every request; YoutubeDLHandler.http_request
# forces these onto each outgoing request, replacing same-named headers.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simple" titles: ASCII letters and digits,
# built as a unicode string (str.decode is the Python 2 way to get unicode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
74
# Use the stdlib json module when available; on Python <2.6 fall back to
# trivialjson, a minimal pure-Python parser exposing the same loads() API.
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		@staticmethod
		def loads(s):
			# Decode the UTF-8 byte string and parse it with a hand-written
			# recursive-descent parser. Each parse* helper takes an index
			# into s and returns a (next_index, value) tuple.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past whitespace; optionally fail on premature EOF.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (group 1, without the
				# leading backslash) into the character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair: combine the two \uXXXX halves.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				while True:
					# Find the closing quote, skipping quotes preceded by an
					# odd number of backslashes (i.e. escaped quotes).
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three bare literals: true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first non-space character; numbers are the
				# fallback since they have no single sentinel character.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
187
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		encoding = locale.getpreferredencoding()
		# Make sure the reported encoding is actually usable by the codecs.
		u'TEST'.encode(encoding)
	except:
		encoding = 'UTF-8'
	return encoding
203
204
205 def htmlentity_transform(matchobj):
206         """Transforms an HTML entity to a Unicode character.
207
208         This function receives a match object and is intended to be used with
209         the re.sub() function.
210         """
211         entity = matchobj.group(1)
212
213         # Known non-numeric HTML entity
214         if entity in htmlentitydefs.name2codepoint:
215                 return unichr(htmlentitydefs.name2codepoint[entity])
216
217         # Unicode character
218         mobj = re.match(ur'(?u)#(x?\d+)', entity)
219         if mobj is not None:
220                 numstr = mobj.group(1)
221                 if numstr.startswith(u'x'):
222                         base = 16
223                         numstr = u'0%s' % numstr
224                 else:
225                         base = 10
226                 return unichr(long(numstr, base))
227
228         # Unknown entity in name, return its literal representation
229         return (u'&%s;' % entity)
230
231
232 def sanitize_title(utitle):
233         """Sanitizes a video title so it could be used as part of a filename."""
234         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
235         return utitle.replace(unicode(os.sep), u'%')
236
237
238 def sanitize_open(filename, open_mode):
239         """Try to open the given filename, and slightly tweak it if this fails.
240
241         Attempts to open the given filename. If this fails, it tries to change
242         the filename slightly, step by step, until it's either able to open it
243         or it fails and raises a final exception, like the standard open()
244         function.
245
246         It returns the tuple (stream, definitive_file_name).
247         """
248         try:
249                 if filename == u'-':
250                         if sys.platform == 'win32':
251                                 import msvcrt
252                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
253                         return (sys.stdout, filename)
254                 stream = open(filename, open_mode)
255                 return (stream, filename)
256         except (IOError, OSError), err:
257                 # In case of error, try to remove win32 forbidden chars
258                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
259
260                 # An exception here should be caught in the caller
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263
264
def timeconvert(timestr):
	"""Convert an RFC 2822 date string into a system timestamp.

	Returns None when the string cannot be parsed.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
272
273
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they are not configured to
	ignore errors; carries the appropriate error message.
	"""
282
283
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
291
292
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
300
301
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that is not available for
	that video.
	"""
309
310
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file is smaller
	than the size the server announced, indicating the connection was
	probably interrupted.

	Attributes (both byte counts):
	downloaded -- number of bytes actually received
	expected   -- number of bytes the server announced
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
325
326
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send a raw deflate stream, others wrap it in a zlib
		# container; try the raw form first and fall back to the wrapped one.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer addinfourl accepts the response code in the constructor
		# (detected via the getcode attribute); emulate it on older versions
		# by assigning .code after construction.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, replacing any same-named header that
		# was already set on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Header names appear here in .capitalize() form (e.g.
		# 'Accept-encoding'), matching how they are stored in req.headers.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: rewrap the body in a decompressing file object, preserving
		# the original headers, url, code and msg.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate: decompress eagerly and serve the result from memory.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
384
385
386 class FileDownloader(object):
387         """File Downloader class.
388
389         File downloader objects are the ones responsible of downloading the
390         actual video file and writing it to disk if the user has requested
391         it, among some other tasks. In most cases there should be one per
392         program. As, given a video URL, the downloader doesn't know how to
393         extract all the needed information, task that InfoExtractors do, it
394         has to pass the URL to one of them.
395
396         For this, file downloader objects have a method that allows
397         InfoExtractors to be registered in a given order. When it is passed
398         a URL, the file downloader handles it to the first InfoExtractor it
399         finds that reports being able to handle it. The InfoExtractor extracts
400         all the information about the video or videos the URL refers to, and
401         asks the FileDownloader to process the video information, possibly
402         downloading the video.
403
404         File downloaders accept a lot of parameters. In order not to saturate
405         the object constructor with arguments, it receives a dictionary of
406         options instead. These options are available through the params
407         attribute for the InfoExtractors to use. The FileDownloader also
408         registers itself as the downloader in charge for the InfoExtractors
409         that are added to it, so this is a "mutual registration".
410
411         Available options:
412
413         username:         Username for authentication purposes.
414         password:         Password for authentication purposes.
415         usenetrc:         Use netrc for authentication instead.
416         quiet:            Do not print messages to stdout.
417         forceurl:         Force printing final URL.
418         forcetitle:       Force printing title.
419         forcethumbnail:   Force printing thumbnail URL.
420         forcedescription: Force printing description.
421         forcefilename:    Force printing final filename.
422         simulate:         Do not download the video files.
423         format:           Video format code.
424         format_limit:     Highest quality format to try.
425         outtmpl:          Template for output names.
426         ignoreerrors:     Do not stop on download errors.
427         ratelimit:        Download speed limit, in bytes/sec.
428         nooverwrites:     Prevent overwriting files.
429         retries:          Number of times to retry for HTTP error 5xx
430         continuedl:       Try to continue downloads if possible.
431         noprogress:       Do not print the progress bar.
432         playliststart:    Playlist item to start at.
433         playlistend:      Playlist item to end at.
434         logtostderr:      Log messages to stderr instead of stdout.
435         consoletitle:     Display progress in console window's titlebar.
436         nopart:           Do not use temporary .part files.
437         updatetime:       Use the Last-modified header to set output file timestamps.
438         writedescription: Write the video description to a .description file
439         writeinfojson:    Write the video description to a .info.json file
440         """
441
	# Declarative defaults; the mutable class-level values are shadowed by
	# fresh objects in __init__ and only serve as documentation.
	params = None  # option dictionary passed to __init__
	_ies = []  # registered InfoExtractors, in priority order
	_pps = []  # post-processor chain
	_download_retcode = None  # process return code accumulated over downloads
	_num_downloads = None  # ordinal used by the %(autonumber)s template
	_screen_file = None  # stream used by to_screen (stdout or stderr)
448
449         def __init__(self, params):
450                 """Create a FileDownloader object with the given options."""
451                 self._ies = []
452                 self._pps = []
453                 self._download_retcode = 0
454                 self._num_downloads = 0
455                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
456                 self.params = params
457
458         @staticmethod
459         def pmkdir(filename):
460                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
461                 components = filename.split(os.sep)
462                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
463                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
464                 for dir in aggregate:
465                         if not os.path.exists(dir):
466                                 os.mkdir(dir)
467
468         @staticmethod
469         def format_bytes(bytes):
470                 if bytes is None:
471                         return 'N/A'
472                 if type(bytes) is str:
473                         bytes = float(bytes)
474                 if bytes == 0.0:
475                         exponent = 0
476                 else:
477                         exponent = long(math.log(bytes, 1024.0))
478                 suffix = 'bkMGTPEZY'[exponent]
479                 converted = float(bytes) / float(1024 ** exponent)
480                 return '%.2f%s' % (converted, suffix)
481
482         @staticmethod
483         def calc_percent(byte_counter, data_len):
484                 if data_len is None:
485                         return '---.-%'
486                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
487
488         @staticmethod
489         def calc_eta(start, now, total, current):
490                 if total is None:
491                         return '--:--'
492                 dif = now - start
493                 if current == 0 or dif < 0.001: # One millisecond
494                         return '--:--'
495                 rate = float(current) / dif
496                 eta = long((float(total) - float(current)) / rate)
497                 (eta_mins, eta_secs) = divmod(eta, 60)
498                 if eta_mins > 99:
499                         return '--:--'
500                 return '%02d:%02d' % (eta_mins, eta_secs)
501
502         @staticmethod
503         def calc_speed(start, now, bytes):
504                 dif = now - start
505                 if bytes == 0 or dif < 0.001: # One millisecond
506                         return '%10s' % '---b/s'
507                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
508
509         @staticmethod
510         def best_block_size(elapsed_time, bytes):
511                 new_min = max(bytes / 2.0, 1.0)
512                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
513                 if elapsed_time < 0.001:
514                         return long(new_max)
515                 rate = bytes / elapsed_time
516                 if rate > new_max:
517                         return long(new_max)
518                 if rate < new_min:
519                         return long(new_min)
520                 return long(rate)
521
522         @staticmethod
523         def parse_bytes(bytestr):
524                 """Parse a string indicating a byte quantity into a long integer."""
525                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
526                 if matchobj is None:
527                         return None
528                 number = float(matchobj.group(1))
529                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
530                 return long(round(number * multiplier))
531
532         def add_info_extractor(self, ie):
533                 """Add an InfoExtractor object to the end of the list."""
534                 self._ies.append(ie)
535                 ie.set_downloader(self)
536
537         def add_post_processor(self, pp):
538                 """Add a PostProcessor object to the end of the chain."""
539                 self._pps.append(pp)
540                 pp.set_downloader(self)
541
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# Encode explicitly: the target stream may not accept unicode.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			# Informational messages may ask to ignore encoding problems
			# instead of aborting the download.
			if not ignore_encoding_errors:
				raise
552
	def to_stderr(self, message):
		"""Print message to stderr (always, regardless of quiet mode)."""
		print >>sys.stderr, message.encode(preferredencoding())
556
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible terminals: use the OSC 0 escape sequence.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567
568         def fixed_template(self):
569                 """Checks if the output template is fixed."""
570                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571
572         def trouble(self, message=None):
573                 """Determine action to take when a download problem appears.
574
575                 Depending on if the downloader has been configured to ignore
576                 download errors or not, this method may throw an exception or
577                 not when errors are found, after printing the message.
578                 """
579                 if message is not None:
580                         self.to_stderr(message)
581                 if not self.params.get('ignoreerrors', False):
582                         raise DownloadError(message)
583                 self._download_retcode = 1
584
585         def slow_down(self, start_time, byte_counter):
586                 """Sleep if the download speed is over the rate limit."""
587                 rate_limit = self.params.get('ratelimit', None)
588                 if rate_limit is None or byte_counter == 0:
589                         return
590                 now = time.time()
591                 elapsed = now - start_time
592                 if elapsed <= 0.0:
593                         return
594                 speed = float(byte_counter) / elapsed
595                 if speed > rate_limit:
596                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597
598         def temp_name(self, filename):
599                 """Returns a temporary filename for the given filename."""
600                 if self.params.get('nopart', False) or filename == u'-' or \
601                                 (os.path.exists(filename) and not os.path.isfile(filename)):
602                         return filename
603                 return filename + u'.part'
604
605         def undo_temp_name(self, filename):
606                 if filename.endswith(u'.part'):
607                         return filename[:-len(u'.part')]
608                 return filename
609
610         def try_rename(self, old_filename, new_filename):
611                 try:
612                         if old_filename == new_filename:
613                                 return
614                         os.rename(old_filename, new_filename)
615                 except (IOError, OSError), err:
616                         self.trouble(u'ERROR: unable to rename file')
617
618         def try_utime(self, filename, last_modified_hdr):
619                 """Try to set the last-modified time of the given file."""
620                 if last_modified_hdr is None:
621                         return
622                 if not os.path.isfile(filename):
623                         return
624                 timestr = last_modified_hdr
625                 if timestr is None:
626                         return
627                 filetime = timeconvert(timestr)
628                 if filetime is None:
629                         return
630                 try:
631                         os.utime(filename, (time.time(), filetime))
632                 except:
633                         pass
634
635         def report_writedescription(self, descfn):
636                 """ Report that the description file is being written """
637                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639         def report_writeinfojson(self, infofn):
640                 """ Report that the metadata file has been written """
641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
643         def report_destination(self, filename):
644                 """Report destination filename."""
645                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648                 """Report download progress."""
649                 if self.params.get('noprogress', False):
650                         return
651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
656         def report_resuming_byte(self, resume_len):
657                 """Report attempt to resume at given byte."""
658                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
660         def report_retry(self, count, retries):
661                 """Report retry in case of HTTP error 5xx"""
662                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
664         def report_file_already_downloaded(self, file_name):
665                 """Report file has already been fully downloaded."""
666                 try:
667                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
668                 except (UnicodeEncodeError), err:
669                         self.to_screen(u'[download] The file has already been downloaded')
670
671         def report_unable_to_resume(self):
672                 """Report it was impossible to resume download."""
673                 self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only the forced printings are performed.
		Otherwise this creates the target directories, writes the
		optional .description / .info.json side files, downloads the
		video data and runs the postprocessing chain.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			# prepare_filename() already reported the template error.
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		if self.params.get('writedescription', False):
			# Save the plain-text description next to the video file.
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# On Python <2.6 'json' may be the trivialjson fallback,
			# which has no dump(); bail out early in that case.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local filesystem failures are reported as the video being unavailable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
777
778         def download(self, url_list):
779                 """Download a given list of URLs."""
780                 if len(url_list) > 1 and self.fixed_template():
781                         raise SameFileError(self.params['outtmpl'])
782
783                 for url in url_list:
784                         suitable_found = False
785                         for ie in self._ies:
786                                 # Go to next InfoExtractor if not suitable
787                                 if not ie.suitable(url):
788                                         continue
789
790                                 # Suitable InfoExtractor found
791                                 suitable_found = True
792
793                                 # Extract information from URL and process it
794                                 ie.extract(url)
795
796                                 # Suitable InfoExtractor had been found; go to next URL
797                                 break
798
799                         if not suitable_found:
800                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
801
802                 return self._download_retcode
803
804         def post_process(self, filename, ie_info):
805                 """Run the postprocessing chain on the given file."""
806                 info = dict(ie_info)
807                 info['filepath'] = filename
808                 for pp in self._pps:
809                         info = pp.run(info)
810                         if info is None:
811                                 break
812
813         def _download_with_rtmpdump(self, filename, url, player_url):
814                 self.report_destination(filename)
815                 tmpfilename = self.temp_name(filename)
816
817                 # Check for rtmpdump first
818                 try:
819                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
820                 except (OSError, IOError):
821                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
822                         return False
823
824                 # Download using rtmpdump. rtmpdump returns exit code 2 when
825                 # the connection was interrumpted and resuming appears to be
826                 # possible. This is part of rtmpdump's normal usage, AFAIK.
827                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
828                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
829                 while retval == 2 or retval == 1:
830                         prevsize = os.path.getsize(tmpfilename)
831                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
832                         time.sleep(5.0) # This seems to be needed
833                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
834                         cursize = os.path.getsize(tmpfilename)
835                         if prevsize == cursize and retval == 1:
836                                 break
837                 if retval == 0:
838                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
839                         self.try_rename(tmpfilename, filename)
840                         return True
841                 else:
842                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
843                         return False
844
	def _do_download(self, filename, url, player_url):
		"""Download url to filename and return True on success.

		RTMP URLs are delegated to rtmpdump. For HTTP, this supports
		resuming a partial .part file, retries on 5xx errors, adaptive
		block sizing, progress reporting, rate limiting, and an atomic
		rename of the temporary file once the transfer completes.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without a Range header so it can be used
		# below to probe the full Content-Length if resuming fails.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range', 'bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Account for the bytes already on disk when resuming.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
980
981
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it, which are handed to the
	FileDownloader for further processing (typically downloading the
	media to the file system). Each dictionary must contain:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the respective forced-printing
	options (their primary purpose is letting youtube-dl serve as the
	backend of a video search function, e.g. youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	as well as the suitable() static method, and are normally
	instantiated and registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Run one-time setup (authentication, etc.) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for the URL."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor should use."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization; redefined in subclasses."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; redefined in subclasses."""
		pass
1052
1053
1054 class YoutubeIE(InfoExtractor):
1055         """Information extractor for youtube.com."""
1056
1057         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1058         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1059         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1060         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1061         _NETRC_MACHINE = 'youtube'
1062         # Listed in order of quality
1063         _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1064         _video_extensions = {
1065                 '13': '3gp',
1066                 '17': 'mp4',
1067                 '18': 'mp4',
1068                 '22': 'mp4',
1069                 '37': 'mp4',
1070                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1071                 '43': 'webm',
1072                 '45': 'webm',
1073         }
1074
1075         @staticmethod
1076         def suitable(url):
1077                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1078
1079         def report_lang(self):
1080                 """Report attempt to set language."""
1081                 self._downloader.to_screen(u'[youtube] Setting language')
1082
1083         def report_login(self):
1084                 """Report attempt to log in."""
1085                 self._downloader.to_screen(u'[youtube] Logging in')
1086
1087         def report_age_confirmation(self):
1088                 """Report attempt to confirm age."""
1089                 self._downloader.to_screen(u'[youtube] Confirming age')
1090
1091         def report_video_webpage_download(self, video_id):
1092                 """Report attempt to download video webpage."""
1093                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1094
1095         def report_video_info_webpage_download(self, video_id):
1096                 """Report attempt to download video info webpage."""
1097                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1098
1099         def report_information_extraction(self, video_id):
1100                 """Report attempt to extract video information."""
1101                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1102
1103         def report_unavailable_format(self, video_id, format):
1104                 """Report extracted video URL."""
1105                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1106
1107         def report_rtmp_download(self):
1108                 """Indicate the download will use the RTMP protocol."""
1109                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1110
	def _real_initialize(self):
		"""Set the site language, then optionally log in and confirm age.

		Credentials come from the --username/--password options or from
		the 'youtube' entry in the user's .netrc file. Every failure
		except age confirmation is reported as a warning only, so
		anonymous downloads can still proceed.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1179
1180         def _real_extract(self, url):
1181                 # Extract video id from URL
1182                 mobj = re.match(self._VALID_URL, url)
1183                 if mobj is None:
1184                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1185                         return
1186                 video_id = mobj.group(2)
1187
1188                 # Get video webpage
1189                 self.report_video_webpage_download(video_id)
1190                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1191                 try:
1192                         video_webpage = urllib2.urlopen(request).read()
1193                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1195                         return
1196
1197                 # Attempt to extract SWF player URL
1198                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1199                 if mobj is not None:
1200                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1201                 else:
1202                         player_url = None
1203
1204                 # Get video info
1205                 self.report_video_info_webpage_download(video_id)
1206                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1207                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1208                                         % (video_id, el_type))
1209                         request = urllib2.Request(video_info_url)
1210                         try:
1211                                 video_info_webpage = urllib2.urlopen(request).read()
1212                                 video_info = parse_qs(video_info_webpage)
1213                                 if 'token' in video_info:
1214                                         break
1215                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1216                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1217                                 return
1218                 if 'token' not in video_info:
1219                         if 'reason' in video_info:
1220                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1221                         else:
1222                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1223                         return
1224
1225                 # Start extracting information
1226                 self.report_information_extraction(video_id)
1227
1228                 # uploader
1229                 if 'author' not in video_info:
1230                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1231                         return
1232                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1233
1234                 # title
1235                 if 'title' not in video_info:
1236                         self._downloader.trouble(u'ERROR: unable to extract video title')
1237                         return
1238                 video_title = urllib.unquote_plus(video_info['title'][0])
1239                 video_title = video_title.decode('utf-8')
1240                 video_title = sanitize_title(video_title)
1241
1242                 # simplified title
1243                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1244                 simple_title = simple_title.strip(ur'_')
1245
1246                 # thumbnail image
1247                 if 'thumbnail_url' not in video_info:
1248                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1249                         video_thumbnail = ''
1250                 else:   # don't panic if we can't find it
1251                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1252
1253                 # upload date
1254                 upload_date = u'NA'
1255                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1256                 if mobj is not None:
1257                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1258                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1259                         for expression in format_expressions:
1260                                 try:
1261                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1262                                 except:
1263                                         pass
1264
1265                 # description
1266                 try:
1267                         lxml.etree
1268                 except NameError:
1269                         video_description = u'No description available.'
1270                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1271                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1272                                 if mobj is not None:
1273                                         video_description = mobj.group(1).decode('utf-8')
1274                 else:
1275                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1276                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1277                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1278                         # TODO use another parser
1279
1280                 # token
1281                 video_token = urllib.unquote_plus(video_info['token'][0])
1282
1283                 # Decide which formats to download
1284                 req_format = self._downloader.params.get('format', None)
1285
1286                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1287                         self.report_rtmp_download()
1288                         video_url_list = [(None, video_info['conn'][0])]
1289                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1290                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1291                         url_data = [parse_qs(uds) for uds in url_data_strs]
1292                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1293                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1294
1295                         format_limit = self._downloader.params.get('format_limit', None)
1296                         if format_limit is not None and format_limit in self._available_formats:
1297                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1298                         else:
1299                                 format_list = self._available_formats
1300                         existing_formats = [x for x in format_list if x in url_map]
1301                         if len(existing_formats) == 0:
1302                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1303                                 return
1304                         if req_format is None:
1305                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1306                         elif req_format == '-1':
1307                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1308                         else:
1309                                 # Specific format
1310                                 if req_format not in url_map:
1311                                         self._downloader.trouble(u'ERROR: requested format not available')
1312                                         return
1313                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1314                 else:
1315                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1316                         return
1317
1318                 for format_param, video_real_url in video_url_list:
1319                         # At this point we have a new video
1320                         self._downloader.increment_downloads()
1321
1322                         # Extension
1323                         video_extension = self._video_extensions.get(format_param, 'flv')
1324
1325                         try:
1326                                 # Process video information
1327                                 self._downloader.process_info({
1328                                         'id':           video_id.decode('utf-8'),
1329                                         'url':          video_real_url.decode('utf-8'),
1330                                         'uploader':     video_uploader.decode('utf-8'),
1331                                         'upload_date':  upload_date,
1332                                         'title':        video_title,
1333                                         'stitle':       simple_title,
1334                                         'ext':          video_extension.decode('utf-8'),
1335                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1336                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1337                                         'description':  video_description,
1338                                         'player_url':   player_url,
1339                                 })
1340                         except UnavailableVideoError, err:
1341                                 self._downloader.trouble(u'\nERROR: unable to download video')
1342
1343
1344 class MetacafeIE(InfoExtractor):
1345         """Information Extractor for metacafe.com."""
1346
1347         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1348         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1349         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1350         _youtube_ie = None
1351
1352         def __init__(self, youtube_ie, downloader=None):
1353                 InfoExtractor.__init__(self, downloader)
1354                 self._youtube_ie = youtube_ie
1355
1356         @staticmethod
1357         def suitable(url):
1358                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1359
1360         def report_disclaimer(self):
1361                 """Report disclaimer retrieval."""
1362                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1363
1364         def report_age_confirmation(self):
1365                 """Report attempt to confirm age."""
1366                 self._downloader.to_screen(u'[metacafe] Confirming age')
1367
1368         def report_download_webpage(self, video_id):
1369                 """Report webpage download."""
1370                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1371
1372         def report_extraction(self, video_id):
1373                 """Report information extraction."""
1374                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1375
1376         def _real_initialize(self):
1377                 # Retrieve disclaimer
1378                 request = urllib2.Request(self._DISCLAIMER)
1379                 try:
1380                         self.report_disclaimer()
1381                         disclaimer = urllib2.urlopen(request).read()
1382                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1383                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1384                         return
1385
1386                 # Confirm age
1387                 disclaimer_form = {
1388                         'filters': '0',
1389                         'submit': "Continue - I'm over 18",
1390                         }
1391                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1392                 try:
1393                         self.report_age_confirmation()
1394                         disclaimer = urllib2.urlopen(request).read()
1395                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1396                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1397                         return
1398
1399         def _real_extract(self, url):
1400                 # Extract id and simplified title from URL
1401                 mobj = re.match(self._VALID_URL, url)
1402                 if mobj is None:
1403                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1404                         return
1405
1406                 video_id = mobj.group(1)
1407
1408                 # Check if video comes from YouTube
1409                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1410                 if mobj2 is not None:
1411                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1412                         return
1413
1414                 # At this point we have a new video
1415                 self._downloader.increment_downloads()
1416
1417                 simple_title = mobj.group(2).decode('utf-8')
1418
1419                 # Retrieve video webpage to extract further information
1420                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1421                 try:
1422                         self.report_download_webpage(video_id)
1423                         webpage = urllib2.urlopen(request).read()
1424                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1426                         return
1427
1428                 # Extract URL, uploader and title from webpage
1429                 self.report_extraction(video_id)
1430                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1431                 if mobj is not None:
1432                         mediaURL = urllib.unquote(mobj.group(1))
1433                         video_extension = mediaURL[-3:]
1434
1435                         # Extract gdaKey if available
1436                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1437                         if mobj is None:
1438                                 video_url = mediaURL
1439                         else:
1440                                 gdaKey = mobj.group(1)
1441                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1442                 else:
1443                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1444                         if mobj is None:
1445                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1446                                 return
1447                         vardict = parse_qs(mobj.group(1))
1448                         if 'mediaData' not in vardict:
1449                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1450                                 return
1451                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1452                         if mobj is None:
1453                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1454                                 return
1455                         mediaURL = mobj.group(1).replace('\\/', '/')
1456                         video_extension = mediaURL[-3:]
1457                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1458
1459                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1460                 if mobj is None:
1461                         self._downloader.trouble(u'ERROR: unable to extract title')
1462                         return
1463                 video_title = mobj.group(1).decode('utf-8')
1464                 video_title = sanitize_title(video_title)
1465
1466                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1467                 if mobj is None:
1468                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1469                         return
1470                 video_uploader = mobj.group(1)
1471
1472                 try:
1473                         # Process video information
1474                         self._downloader.process_info({
1475                                 'id':           video_id.decode('utf-8'),
1476                                 'url':          video_url.decode('utf-8'),
1477                                 'uploader':     video_uploader.decode('utf-8'),
1478                                 'upload_date':  u'NA',
1479                                 'title':        video_title,
1480                                 'stitle':       simple_title,
1481                                 'ext':          video_extension.decode('utf-8'),
1482                                 'format':       u'NA',
1483                                 'player_url':   None,
1484                         })
1485                 except UnavailableVideoError:
1486                         self._downloader.trouble(u'\nERROR: unable to download video')
1487
1488
1489 class DailymotionIE(InfoExtractor):
1490         """Information Extractor for Dailymotion"""
1491
1492         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1493
1494         def __init__(self, downloader=None):
1495                 InfoExtractor.__init__(self, downloader)
1496
1497         @staticmethod
1498         def suitable(url):
1499                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1500
1501         def report_download_webpage(self, video_id):
1502                 """Report webpage download."""
1503                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1504
1505         def report_extraction(self, video_id):
1506                 """Report information extraction."""
1507                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1508
1509         def _real_initialize(self):
1510                 return
1511
1512         def _real_extract(self, url):
1513                 # Extract id and simplified title from URL
1514                 mobj = re.match(self._VALID_URL, url)
1515                 if mobj is None:
1516                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1517                         return
1518
1519                 # At this point we have a new video
1520                 self._downloader.increment_downloads()
1521                 video_id = mobj.group(1)
1522
1523                 simple_title = mobj.group(2).decode('utf-8')
1524                 video_extension = 'flv'
1525
1526                 # Retrieve video webpage to extract further information
1527                 request = urllib2.Request(url)
1528                 try:
1529                         self.report_download_webpage(video_id)
1530                         webpage = urllib2.urlopen(request).read()
1531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1532                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1533                         return
1534
1535                 # Extract URL, uploader and title from webpage
1536                 self.report_extraction(video_id)
1537                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1538                 if mobj is None:
1539                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1540                         return
1541                 mediaURL = urllib.unquote(mobj.group(1))
1542
1543                 # if needed add http://www.dailymotion.com/ if relative URL
1544
1545                 video_url = mediaURL
1546
1547                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1548                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1549                 if mobj is None:
1550                         self._downloader.trouble(u'ERROR: unable to extract title')
1551                         return
1552                 video_title = mobj.group(1).decode('utf-8')
1553                 video_title = sanitize_title(video_title)
1554
1555                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1556                 if mobj is None:
1557                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1558                         return
1559                 video_uploader = mobj.group(1)
1560
1561                 try:
1562                         # Process video information
1563                         self._downloader.process_info({
1564                                 'id':           video_id.decode('utf-8'),
1565                                 'url':          video_url.decode('utf-8'),
1566                                 'uploader':     video_uploader.decode('utf-8'),
1567                                 'upload_date':  u'NA',
1568                                 'title':        video_title,
1569                                 'stitle':       simple_title,
1570                                 'ext':          video_extension.decode('utf-8'),
1571                                 'format':       u'NA',
1572                                 'player_url':   None,
1573                         })
1574                 except UnavailableVideoError:
1575                         self._downloader.trouble(u'\nERROR: unable to download video')
1576
1577
1578 class GoogleIE(InfoExtractor):
1579         """Information extractor for video.google.com."""
1580
1581         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1582
1583         def __init__(self, downloader=None):
1584                 InfoExtractor.__init__(self, downloader)
1585
1586         @staticmethod
1587         def suitable(url):
1588                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1589
1590         def report_download_webpage(self, video_id):
1591                 """Report webpage download."""
1592                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1593
1594         def report_extraction(self, video_id):
1595                 """Report information extraction."""
1596                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1597
1598         def _real_initialize(self):
1599                 return
1600
1601         def _real_extract(self, url):
1602                 # Extract id from URL
1603                 mobj = re.match(self._VALID_URL, url)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1606                         return
1607
1608                 # At this point we have a new video
1609                 self._downloader.increment_downloads()
1610                 video_id = mobj.group(1)
1611
1612                 video_extension = 'mp4'
1613
1614                 # Retrieve video webpage to extract further information
1615                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1616                 try:
1617                         self.report_download_webpage(video_id)
1618                         webpage = urllib2.urlopen(request).read()
1619                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1620                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1621                         return
1622
1623                 # Extract URL, uploader, and title from webpage
1624                 self.report_extraction(video_id)
1625                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1626                 if mobj is None:
1627                         video_extension = 'flv'
1628                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1631                         return
1632                 mediaURL = urllib.unquote(mobj.group(1))
1633                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1634                 mediaURL = mediaURL.replace('\\x26', '\x26')
1635
1636                 video_url = mediaURL
1637
1638                 mobj = re.search(r'<title>(.*)</title>', webpage)
1639                 if mobj is None:
1640                         self._downloader.trouble(u'ERROR: unable to extract title')
1641                         return
1642                 video_title = mobj.group(1).decode('utf-8')
1643                 video_title = sanitize_title(video_title)
1644                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1645
1646                 # Extract video description
1647                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1648                 if mobj is None:
1649                         self._downloader.trouble(u'ERROR: unable to extract video description')
1650                         return
1651                 video_description = mobj.group(1).decode('utf-8')
1652                 if not video_description:
1653                         video_description = 'No description available.'
1654
1655                 # Extract video thumbnail
1656                 if self._downloader.params.get('forcethumbnail', False):
1657                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1658                         try:
1659                                 webpage = urllib2.urlopen(request).read()
1660                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1661                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1662                                 return
1663                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1664                         if mobj is None:
1665                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1666                                 return
1667                         video_thumbnail = mobj.group(1)
1668                 else:   # we need something to pass to process_info
1669                         video_thumbnail = ''
1670
1671                 try:
1672                         # Process video information
1673                         self._downloader.process_info({
1674                                 'id':           video_id.decode('utf-8'),
1675                                 'url':          video_url.decode('utf-8'),
1676                                 'uploader':     u'NA',
1677                                 'upload_date':  u'NA',
1678                                 'title':        video_title,
1679                                 'stitle':       simple_title,
1680                                 'ext':          video_extension.decode('utf-8'),
1681                                 'format':       u'NA',
1682                                 'player_url':   None,
1683                         })
1684                 except UnavailableVideoError:
1685                         self._downloader.trouble(u'\nERROR: unable to download video')
1686
1687
1688 class PhotobucketIE(InfoExtractor):
1689         """Information extractor for photobucket.com."""
1690
1691         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1692
1693         def __init__(self, downloader=None):
1694                 InfoExtractor.__init__(self, downloader)
1695
1696         @staticmethod
1697         def suitable(url):
1698                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1699
1700         def report_download_webpage(self, video_id):
1701                 """Report webpage download."""
1702                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1703
1704         def report_extraction(self, video_id):
1705                 """Report information extraction."""
1706                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1707
1708         def _real_initialize(self):
1709                 return
1710
1711         def _real_extract(self, url):
1712                 # Extract id from URL
1713                 mobj = re.match(self._VALID_URL, url)
1714                 if mobj is None:
1715                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1716                         return
1717
1718                 # At this point we have a new video
1719                 self._downloader.increment_downloads()
1720                 video_id = mobj.group(1)
1721
1722                 video_extension = 'flv'
1723
1724                 # Retrieve video webpage to extract further information
1725                 request = urllib2.Request(url)
1726                 try:
1727                         self.report_download_webpage(video_id)
1728                         webpage = urllib2.urlopen(request).read()
1729                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1730                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1731                         return
1732
1733                 # Extract URL, uploader, and title from webpage
1734                 self.report_extraction(video_id)
1735                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1736                 if mobj is None:
1737                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1738                         return
1739                 mediaURL = urllib.unquote(mobj.group(1))
1740
1741                 video_url = mediaURL
1742
1743                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1744                 if mobj is None:
1745                         self._downloader.trouble(u'ERROR: unable to extract title')
1746                         return
1747                 video_title = mobj.group(1).decode('utf-8')
1748                 video_title = sanitize_title(video_title)
1749                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1750
1751                 video_uploader = mobj.group(2).decode('utf-8')
1752
1753                 try:
1754                         # Process video information
1755                         self._downloader.process_info({
1756                                 'id':           video_id.decode('utf-8'),
1757                                 'url':          video_url.decode('utf-8'),
1758                                 'uploader':     video_uploader,
1759                                 'upload_date':  u'NA',
1760                                 'title':        video_title,
1761                                 'stitle':       simple_title,
1762                                 'ext':          video_extension.decode('utf-8'),
1763                                 'format':       u'NA',
1764                                 'player_url':   None,
1765                         })
1766                 except UnavailableVideoError:
1767                         self._downloader.trouble(u'\nERROR: unable to download video')
1768
1769
1770 class YahooIE(InfoExtractor):
1771         """Information extractor for video.yahoo.com."""
1772
1773         # _VALID_URL matches all Yahoo! Video URLs
1774         # _VPAGE_URL matches only the extractable '/watch/' URLs
1775         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1776         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1777
	def __init__(self, downloader=None):
		# Delegate common setup (downloader registration) to the base class.
		InfoExtractor.__init__(self, downloader)
1780
1781         @staticmethod
1782         def suitable(url):
1783                 return (re.match(YahooIE._VALID_URL, url) is not None)
1784
1785         def report_download_webpage(self, video_id):
1786                 """Report webpage download."""
1787                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1788
1789         def report_extraction(self, video_id):
1790                 """Report information extraction."""
1791                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1792
	def _real_initialize(self):
		# Yahoo needs no authentication or cookie warm-up before extraction.
		return
1795
1796         def _real_extract(self, url, new_video=True):
1797                 # Extract ID from URL
1798                 mobj = re.match(self._VALID_URL, url)
1799                 if mobj is None:
1800                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1801                         return
1802
1803                 # At this point we have a new video
1804                 self._downloader.increment_downloads()
1805                 video_id = mobj.group(2)
1806                 video_extension = 'flv'
1807
1808                 # Rewrite valid but non-extractable URLs as
1809                 # extractable English language /watch/ URLs
1810                 if re.match(self._VPAGE_URL, url) is None:
1811                         request = urllib2.Request(url)
1812                         try:
1813                                 webpage = urllib2.urlopen(request).read()
1814                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1815                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1816                                 return
1817
1818                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1819                         if mobj is None:
1820                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1821                                 return
1822                         yahoo_id = mobj.group(1)
1823
1824                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1825                         if mobj is None:
1826                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1827                                 return
1828                         yahoo_vid = mobj.group(1)
1829
1830                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1831                         return self._real_extract(url, new_video=False)
1832
1833                 # Retrieve video webpage to extract further information
1834                 request = urllib2.Request(url)
1835                 try:
1836                         self.report_download_webpage(video_id)
1837                         webpage = urllib2.urlopen(request).read()
1838                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1840                         return
1841
1842                 # Extract uploader and title from webpage
1843                 self.report_extraction(video_id)
1844                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1845                 if mobj is None:
1846                         self._downloader.trouble(u'ERROR: unable to extract video title')
1847                         return
1848                 video_title = mobj.group(1).decode('utf-8')
1849                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1850
1851                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1852                 if mobj is None:
1853                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1854                         return
1855                 video_uploader = mobj.group(1).decode('utf-8')
1856
1857                 # Extract video thumbnail
1858                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1859                 if mobj is None:
1860                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1861                         return
1862                 video_thumbnail = mobj.group(1).decode('utf-8')
1863
1864                 # Extract video description
1865                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1866                 if mobj is None:
1867                         self._downloader.trouble(u'ERROR: unable to extract video description')
1868                         return
1869                 video_description = mobj.group(1).decode('utf-8')
1870                 if not video_description:
1871                         video_description = 'No description available.'
1872
1873                 # Extract video height and width
1874                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1875                 if mobj is None:
1876                         self._downloader.trouble(u'ERROR: unable to extract video height')
1877                         return
1878                 yv_video_height = mobj.group(1)
1879
1880                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1881                 if mobj is None:
1882                         self._downloader.trouble(u'ERROR: unable to extract video width')
1883                         return
1884                 yv_video_width = mobj.group(1)
1885
1886                 # Retrieve video playlist to extract media URL
1887                 # I'm not completely sure what all these options are, but we
1888                 # seem to need most of them, otherwise the server sends a 401.
1889                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1890                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1891                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1892                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1893                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1894                 try:
1895                         self.report_download_webpage(video_id)
1896                         webpage = urllib2.urlopen(request).read()
1897                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1898                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1899                         return
1900
1901                 # Extract media URL from playlist XML
1902                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1903                 if mobj is None:
1904                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1905                         return
1906                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1907                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1908
1909                 try:
1910                         # Process video information
1911                         self._downloader.process_info({
1912                                 'id':           video_id.decode('utf-8'),
1913                                 'url':          video_url,
1914                                 'uploader':     video_uploader,
1915                                 'upload_date':  u'NA',
1916                                 'title':        video_title,
1917                                 'stitle':       simple_title,
1918                                 'ext':          video_extension.decode('utf-8'),
1919                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1920                                 'description':  video_description,
1921                                 'thumbnail':    video_thumbnail,
1922                                 'description':  video_description,
1923                                 'player_url':   None,
1924                         })
1925                 except UnavailableVideoError:
1926                         self._downloader.trouble(u'\nERROR: unable to download video')
1927
1928
1929 class VimeoIE(InfoExtractor):
1930         """Information extractor for vimeo.com."""
1931
1932         # _VALID_URL matches Vimeo URLs
1933         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1934
1935         def __init__(self, downloader=None):
1936                 InfoExtractor.__init__(self, downloader)
1937
1938         @staticmethod
1939         def suitable(url):
1940                 return (re.match(VimeoIE._VALID_URL, url) is not None)
1941
1942         def report_download_webpage(self, video_id):
1943                 """Report webpage download."""
1944                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1945
1946         def report_extraction(self, video_id):
1947                 """Report information extraction."""
1948                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1949
1950         def _real_initialize(self):
1951                 return
1952
1953         def _real_extract(self, url, new_video=True):
1954                 # Extract ID from URL
1955                 mobj = re.match(self._VALID_URL, url)
1956                 if mobj is None:
1957                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1958                         return
1959
1960                 # At this point we have a new video
1961                 self._downloader.increment_downloads()
1962                 video_id = mobj.group(1)
1963
1964                 # Retrieve video webpage to extract further information
1965                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1966                 try:
1967                         self.report_download_webpage(video_id)
1968                         webpage = urllib2.urlopen(request).read()
1969                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1970                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1971                         return
1972
1973                 # Now we begin extracting as much information as we can from what we
1974                 # retrieved. First we extract the information common to all extractors,
1975                 # and latter we extract those that are Vimeo specific.
1976                 self.report_extraction(video_id)
1977
1978                 # Extract title
1979                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1980                 if mobj is None:
1981                         self._downloader.trouble(u'ERROR: unable to extract video title')
1982                         return
1983                 video_title = mobj.group(1).decode('utf-8')
1984                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1985
1986                 # Extract uploader
1987                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1988                 if mobj is None:
1989                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1990                         return
1991                 video_uploader = mobj.group(1).decode('utf-8')
1992
1993                 # Extract video thumbnail
1994                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1995                 if mobj is None:
1996                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1997                         return
1998                 video_thumbnail = mobj.group(1).decode('utf-8')
1999
2000                 # # Extract video description
2001                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2002                 # if mobj is None:
2003                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2004                 #       return
2005                 # video_description = mobj.group(1).decode('utf-8')
2006                 # if not video_description: video_description = 'No description available.'
2007                 video_description = 'Foo.'
2008
2009                 # Vimeo specific: extract request signature
2010                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2011                 if mobj is None:
2012                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2013                         return
2014                 sig = mobj.group(1).decode('utf-8')
2015
2016                 # Vimeo specific: Extract request signature expiration
2017                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2018                 if mobj is None:
2019                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2020                         return
2021                 sig_exp = mobj.group(1).decode('utf-8')
2022
2023                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2024
2025                 try:
2026                         # Process video information
2027                         self._downloader.process_info({
2028                                 'id':           video_id.decode('utf-8'),
2029                                 'url':          video_url,
2030                                 'uploader':     video_uploader,
2031                                 'upload_date':  u'NA',
2032                                 'title':        video_title,
2033                                 'stitle':       simple_title,
2034                                 'ext':          u'mp4',
2035                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2036                                 'description':  video_description,
2037                                 'thumbnail':    video_thumbnail,
2038                                 'description':  video_description,
2039                                 'player_url':   None,
2040                         })
2041                 except UnavailableVideoError:
2042                         self._downloader.trouble(u'ERROR: unable to download video')
2043
2044
2045 class GenericIE(InfoExtractor):
2046         """Generic last-resort information extractor."""
2047
2048         def __init__(self, downloader=None):
2049                 InfoExtractor.__init__(self, downloader)
2050
2051         @staticmethod
2052         def suitable(url):
2053                 return True
2054
2055         def report_download_webpage(self, video_id):
2056                 """Report webpage download."""
2057                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2058                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2059
2060         def report_extraction(self, video_id):
2061                 """Report information extraction."""
2062                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2063
2064         def _real_initialize(self):
2065                 return
2066
2067         def _real_extract(self, url):
2068                 # At this point we have a new video
2069                 self._downloader.increment_downloads()
2070
2071                 video_id = url.split('/')[-1]
2072                 request = urllib2.Request(url)
2073                 try:
2074                         self.report_download_webpage(video_id)
2075                         webpage = urllib2.urlopen(request).read()
2076                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2077                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2078                         return
2079                 except ValueError, err:
2080                         # since this is the last-resort InfoExtractor, if
2081                         # this error is thrown, it'll be thrown here
2082                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2083                         return
2084
2085                 self.report_extraction(video_id)
2086                 # Start with something easy: JW Player in SWFObject
2087                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2088                 if mobj is None:
2089                         # Broaden the search a little bit
2090                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2091                 if mobj is None:
2092                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2093                         return
2094
2095                 # It's possible that one of the regexes
2096                 # matched, but returned an empty group:
2097                 if mobj.group(1) is None:
2098                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2099                         return
2100
2101                 video_url = urllib.unquote(mobj.group(1))
2102                 video_id = os.path.basename(video_url)
2103
2104                 # here's a fun little line of code for you:
2105                 video_extension = os.path.splitext(video_id)[1][1:]
2106                 video_id = os.path.splitext(video_id)[0]
2107
2108                 # it's tempting to parse this further, but you would
2109                 # have to take into account all the variations like
2110                 #   Video Title - Site Name
2111                 #   Site Name | Video Title
2112                 #   Video Title - Tagline | Site Name
2113                 # and so on and so forth; it's just not practical
2114                 mobj = re.search(r'<title>(.*)</title>', webpage)
2115                 if mobj is None:
2116                         self._downloader.trouble(u'ERROR: unable to extract title')
2117                         return
2118                 video_title = mobj.group(1).decode('utf-8')
2119                 video_title = sanitize_title(video_title)
2120                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2121
2122                 # video uploader is domain name
2123                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2124                 if mobj is None:
2125                         self._downloader.trouble(u'ERROR: unable to extract title')
2126                         return
2127                 video_uploader = mobj.group(1).decode('utf-8')
2128
2129                 try:
2130                         # Process video information
2131                         self._downloader.process_info({
2132                                 'id':           video_id.decode('utf-8'),
2133                                 'url':          video_url.decode('utf-8'),
2134                                 'uploader':     video_uploader,
2135                                 'upload_date':  u'NA',
2136                                 'title':        video_title,
2137                                 'stitle':       simple_title,
2138                                 'ext':          video_extension.decode('utf-8'),
2139                                 'format':       u'NA',
2140                                 'player_url':   None,
2141                         })
2142                 except UnavailableVideoError, err:
2143                         self._downloader.trouble(u'\nERROR: unable to download video')
2144
2145
2146 class YoutubeSearchIE(InfoExtractor):
2147         """Information Extractor for YouTube search queries."""
2148         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2149         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2150         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2151         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2152         _youtube_ie = None
2153         _max_youtube_results = 1000
2154
2155         def __init__(self, youtube_ie, downloader=None):
2156                 InfoExtractor.__init__(self, downloader)
2157                 self._youtube_ie = youtube_ie
2158
2159         @staticmethod
2160         def suitable(url):
2161                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2162
2163         def report_download_page(self, query, pagenum):
2164                 """Report attempt to download playlist page with given number."""
2165                 query = query.decode(preferredencoding())
2166                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2167
2168         def _real_initialize(self):
2169                 self._youtube_ie.initialize()
2170
2171         def _real_extract(self, query):
2172                 mobj = re.match(self._VALID_QUERY, query)
2173                 if mobj is None:
2174                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2175                         return
2176
2177                 prefix, query = query.split(':')
2178                 prefix = prefix[8:]
2179                 query = query.encode('utf-8')
2180                 if prefix == '':
2181                         self._download_n_results(query, 1)
2182                         return
2183                 elif prefix == 'all':
2184                         self._download_n_results(query, self._max_youtube_results)
2185                         return
2186                 else:
2187                         try:
2188                                 n = long(prefix)
2189                                 if n <= 0:
2190                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2191                                         return
2192                                 elif n > self._max_youtube_results:
2193                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2194                                         n = self._max_youtube_results
2195                                 self._download_n_results(query, n)
2196                                 return
2197                         except ValueError: # parsing prefix as integer fails
2198                                 self._download_n_results(query, 1)
2199                                 return
2200
2201         def _download_n_results(self, query, n):
2202                 """Downloads a specified number of results for a query"""
2203
2204                 video_ids = []
2205                 already_seen = set()
2206                 pagenum = 1
2207
2208                 while True:
2209                         self.report_download_page(query, pagenum)
2210                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2211                         request = urllib2.Request(result_url)
2212                         try:
2213                                 page = urllib2.urlopen(request).read()
2214                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2215                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2216                                 return
2217
2218                         # Extract video identifiers
2219                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2220                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2221                                 if video_id not in already_seen:
2222                                         video_ids.append(video_id)
2223                                         already_seen.add(video_id)
2224                                         if len(video_ids) == n:
2225                                                 # Specified n videos reached
2226                                                 for id in video_ids:
2227                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2228                                                 return
2229
2230                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2231                                 for id in video_ids:
2232                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2233                                 return
2234
2235                         pagenum = pagenum + 1
2236
2237
2238 class GoogleSearchIE(InfoExtractor):
2239         """Information Extractor for Google Video search queries."""
2240         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2241         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2242         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2243         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2244         _google_ie = None
2245         _max_google_results = 1000
2246
2247         def __init__(self, google_ie, downloader=None):
2248                 InfoExtractor.__init__(self, downloader)
2249                 self._google_ie = google_ie
2250
2251         @staticmethod
2252         def suitable(url):
2253                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2254
2255         def report_download_page(self, query, pagenum):
2256                 """Report attempt to download playlist page with given number."""
2257                 query = query.decode(preferredencoding())
2258                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2259
2260         def _real_initialize(self):
2261                 self._google_ie.initialize()
2262
2263         def _real_extract(self, query):
2264                 mobj = re.match(self._VALID_QUERY, query)
2265                 if mobj is None:
2266                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2267                         return
2268
2269                 prefix, query = query.split(':')
2270                 prefix = prefix[8:]
2271                 query = query.encode('utf-8')
2272                 if prefix == '':
2273                         self._download_n_results(query, 1)
2274                         return
2275                 elif prefix == 'all':
2276                         self._download_n_results(query, self._max_google_results)
2277                         return
2278                 else:
2279                         try:
2280                                 n = long(prefix)
2281                                 if n <= 0:
2282                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2283                                         return
2284                                 elif n > self._max_google_results:
2285                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2286                                         n = self._max_google_results
2287                                 self._download_n_results(query, n)
2288                                 return
2289                         except ValueError: # parsing prefix as integer fails
2290                                 self._download_n_results(query, 1)
2291                                 return
2292
2293         def _download_n_results(self, query, n):
2294                 """Downloads a specified number of results for a query"""
2295
2296                 video_ids = []
2297                 already_seen = set()
2298                 pagenum = 1
2299
2300                 while True:
2301                         self.report_download_page(query, pagenum)
2302                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2303                         request = urllib2.Request(result_url)
2304                         try:
2305                                 page = urllib2.urlopen(request).read()
2306                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2307                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2308                                 return
2309
2310                         # Extract video identifiers
2311                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2312                                 video_id = mobj.group(1)
2313                                 if video_id not in already_seen:
2314                                         video_ids.append(video_id)
2315                                         already_seen.add(video_id)
2316                                         if len(video_ids) == n:
2317                                                 # Specified n videos reached
2318                                                 for id in video_ids:
2319                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2320                                                 return
2321
2322                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2323                                 for id in video_ids:
2324                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2325                                 return
2326
2327                         pagenum = pagenum + 1
2328
2329
2330 class YahooSearchIE(InfoExtractor):
2331         """Information Extractor for Yahoo! Video search queries."""
2332         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2333         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2334         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2335         _MORE_PAGES_INDICATOR = r'\s*Next'
2336         _yahoo_ie = None
2337         _max_yahoo_results = 1000
2338
2339         def __init__(self, yahoo_ie, downloader=None):
2340                 InfoExtractor.__init__(self, downloader)
2341                 self._yahoo_ie = yahoo_ie
2342
2343         @staticmethod
2344         def suitable(url):
2345                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2346
2347         def report_download_page(self, query, pagenum):
2348                 """Report attempt to download playlist page with given number."""
2349                 query = query.decode(preferredencoding())
2350                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2351
2352         def _real_initialize(self):
2353                 self._yahoo_ie.initialize()
2354
2355         def _real_extract(self, query):
2356                 mobj = re.match(self._VALID_QUERY, query)
2357                 if mobj is None:
2358                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2359                         return
2360
2361                 prefix, query = query.split(':')
2362                 prefix = prefix[8:]
2363                 query = query.encode('utf-8')
2364                 if prefix == '':
2365                         self._download_n_results(query, 1)
2366                         return
2367                 elif prefix == 'all':
2368                         self._download_n_results(query, self._max_yahoo_results)
2369                         return
2370                 else:
2371                         try:
2372                                 n = long(prefix)
2373                                 if n <= 0:
2374                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2375                                         return
2376                                 elif n > self._max_yahoo_results:
2377                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2378                                         n = self._max_yahoo_results
2379                                 self._download_n_results(query, n)
2380                                 return
2381                         except ValueError: # parsing prefix as integer fails
2382                                 self._download_n_results(query, 1)
2383                                 return
2384
2385         def _download_n_results(self, query, n):
2386                 """Downloads a specified number of results for a query"""
2387
2388                 video_ids = []
2389                 already_seen = set()
2390                 pagenum = 1
2391
2392                 while True:
2393                         self.report_download_page(query, pagenum)
2394                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2395                         request = urllib2.Request(result_url)
2396                         try:
2397                                 page = urllib2.urlopen(request).read()
2398                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2399                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2400                                 return
2401
2402                         # Extract video identifiers
2403                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2404                                 video_id = mobj.group(1)
2405                                 if video_id not in already_seen:
2406                                         video_ids.append(video_id)
2407                                         already_seen.add(video_id)
2408                                         if len(video_ids) == n:
2409                                                 # Specified n videos reached
2410                                                 for id in video_ids:
2411                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2412                                                 return
2413
2414                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2415                                 for id in video_ids:
2416                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2417                                 return
2418
2419                         pagenum = pagenum + 1
2420
2421
2422 class YoutubePlaylistIE(InfoExtractor):
2423         """Information Extractor for YouTube playlists."""
2424
2425         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2426         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2427         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2428         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2429         _youtube_ie = None
2430
2431         def __init__(self, youtube_ie, downloader=None):
2432                 InfoExtractor.__init__(self, downloader)
2433                 self._youtube_ie = youtube_ie
2434
2435         @staticmethod
2436         def suitable(url):
2437                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2438
2439         def report_download_page(self, playlist_id, pagenum):
2440                 """Report attempt to download playlist page with given number."""
2441                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2442
2443         def _real_initialize(self):
2444                 self._youtube_ie.initialize()
2445
2446         def _real_extract(self, url):
2447                 # Extract playlist id
2448                 mobj = re.match(self._VALID_URL, url)
2449                 if mobj is None:
2450                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2451                         return
2452
2453                 # Single video case
2454                 if mobj.group(3) is not None:
2455                         self._youtube_ie.extract(mobj.group(3))
2456                         return
2457
2458                 # Download playlist pages
2459                 # prefix is 'p' as default for playlists but there are other types that need extra care
2460                 playlist_prefix = mobj.group(1)
2461                 if playlist_prefix == 'a':
2462                         playlist_access = 'artist'
2463                 else:
2464                         playlist_prefix = 'p'
2465                         playlist_access = 'view_play_list'
2466                 playlist_id = mobj.group(2)
2467                 video_ids = []
2468                 pagenum = 1
2469
2470                 while True:
2471                         self.report_download_page(playlist_id, pagenum)
2472                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2473                         try:
2474                                 page = urllib2.urlopen(request).read()
2475                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2477                                 return
2478
2479                         # Extract video identifiers
2480                         ids_in_page = []
2481                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2482                                 if mobj.group(1) not in ids_in_page:
2483                                         ids_in_page.append(mobj.group(1))
2484                         video_ids.extend(ids_in_page)
2485
2486                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2487                                 break
2488                         pagenum = pagenum + 1
2489
2490                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2491                 playlistend = self._downloader.params.get('playlistend', -1)
2492                 video_ids = video_ids[playliststart:playlistend]
2493
2494                 for id in video_ids:
2495                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2496                 return
2497
2498
2499 class YoutubeUserIE(InfoExtractor):
2500         """Information Extractor for YouTube users."""
2501
2502         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2503         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2504         _GDATA_PAGE_SIZE = 50
2505         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2506         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2507         _youtube_ie = None
2508
2509         def __init__(self, youtube_ie, downloader=None):
2510                 InfoExtractor.__init__(self, downloader)
2511                 self._youtube_ie = youtube_ie
2512
2513         @staticmethod
2514         def suitable(url):
2515                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2516
2517         def report_download_page(self, username, start_index):
2518                 """Report attempt to download user page."""
2519                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2520                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2521
2522         def _real_initialize(self):
2523                 self._youtube_ie.initialize()
2524
2525         def _real_extract(self, url):
2526                 # Extract username
2527                 mobj = re.match(self._VALID_URL, url)
2528                 if mobj is None:
2529                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2530                         return
2531
2532                 username = mobj.group(1)
2533
2534                 # Download video ids using YouTube Data API. Result size per
2535                 # query is limited (currently to 50 videos) so we need to query
2536                 # page by page until there are no video ids - it means we got
2537                 # all of them.
2538
2539                 video_ids = []
2540                 pagenum = 0
2541
2542                 while True:
2543                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2544                         self.report_download_page(username, start_index)
2545
2546                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2547
2548                         try:
2549                                 page = urllib2.urlopen(request).read()
2550                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2551                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2552                                 return
2553
2554                         # Extract video identifiers
2555                         ids_in_page = []
2556
2557                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2558                                 if mobj.group(1) not in ids_in_page:
2559                                         ids_in_page.append(mobj.group(1))
2560
2561                         video_ids.extend(ids_in_page)
2562
2563                         # A little optimization - if current page is not
2564                         # "full", ie. does not contain PAGE_SIZE video ids then
2565                         # we can assume that this page is the last one - there
2566                         # are no more ids on further pages - no need to query
2567                         # again.
2568
2569                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2570                                 break
2571
2572                         pagenum += 1
2573
2574                 all_ids_count = len(video_ids)
2575                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2576                 playlistend = self._downloader.params.get('playlistend', -1)
2577
2578                 if playlistend == -1:
2579                         video_ids = video_ids[playliststart:]
2580                 else:
2581                         video_ids = video_ids[playliststart:playlistend]
2582
2583                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2584                                 (username, all_ids_count, len(video_ids)))
2585
2586                 for video_id in video_ids:
2587                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2588
2589
2590 class DepositFilesIE(InfoExtractor):
2591         """Information extractor for depositfiles.com"""
2592
2593         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2594
2595         def __init__(self, downloader=None):
2596                 InfoExtractor.__init__(self, downloader)
2597
2598         @staticmethod
2599         def suitable(url):
2600                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2601
2602         def report_download_webpage(self, file_id):
2603                 """Report webpage download."""
2604                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2605
2606         def report_extraction(self, file_id):
2607                 """Report information extraction."""
2608                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2609
2610         def _real_initialize(self):
2611                 return
2612
2613         def _real_extract(self, url):
2614                 # At this point we have a new file
2615                 self._downloader.increment_downloads()
2616
2617                 file_id = url.split('/')[-1]
2618                 # Rebuild url in english locale
2619                 url = 'http://depositfiles.com/en/files/' + file_id
2620
2621                 # Retrieve file webpage with 'Free download' button pressed
2622                 free_download_indication = { 'gateway_result' : '1' }
2623                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2624                 try:
2625                         self.report_download_webpage(file_id)
2626                         webpage = urllib2.urlopen(request).read()
2627                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2628                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2629                         return
2630
2631                 # Search for the real file URL
2632                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2633                 if (mobj is None) or (mobj.group(1) is None):
2634                         # Try to figure out reason of the error.
2635                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2636                         if (mobj is not None) and (mobj.group(1) is not None):
2637                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2638                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2639                         else:
2640                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2641                         return
2642
2643                 file_url = mobj.group(1)
2644                 file_extension = os.path.splitext(file_url)[1][1:]
2645
2646                 # Search for file title
2647                 mobj = re.search(r'<b title="(.*?)">', webpage)
2648                 if mobj is None:
2649                         self._downloader.trouble(u'ERROR: unable to extract title')
2650                         return
2651                 file_title = mobj.group(1).decode('utf-8')
2652
2653                 try:
2654                         # Process file information
2655                         self._downloader.process_info({
2656                                 'id':           file_id.decode('utf-8'),
2657                                 'url':          file_url.decode('utf-8'),
2658                                 'uploader':     u'NA',
2659                                 'upload_date':  u'NA',
2660                                 'title':        file_title,
2661                                 'stitle':       file_title,
2662                                 'ext':          file_extension.decode('utf-8'),
2663                                 'format':       u'NA',
2664                                 'player_url':   None,
2665                         })
2666                 except UnavailableVideoError, err:
2667                         self._downloader.trouble(u'ERROR: unable to download file')
2668
2669
2670 class FacebookIE(InfoExtractor):
2671         """Information Extractor for Facebook"""
2672
2673         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2674         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2675         _NETRC_MACHINE = 'facebook'
2676         _available_formats = ['highqual', 'lowqual']
2677         _video_extensions = {
2678                 'highqual': 'mp4',
2679                 'lowqual': 'mp4',
2680         }
2681
2682         def __init__(self, downloader=None):
2683                 InfoExtractor.__init__(self, downloader)
2684
2685         @staticmethod
2686         def suitable(url):
2687                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2688
2689         def _reporter(self, message):
2690                 """Add header and report message."""
2691                 self._downloader.to_screen(u'[facebook] %s' % message)
2692
2693         def report_login(self):
2694                 """Report attempt to log in."""
2695                 self._reporter(u'Logging in')
2696
2697         def report_video_webpage_download(self, video_id):
2698                 """Report attempt to download video webpage."""
2699                 self._reporter(u'%s: Downloading video webpage' % video_id)
2700
2701         def report_information_extraction(self, video_id):
2702                 """Report attempt to extract video information."""
2703                 self._reporter(u'%s: Extracting video information' % video_id)
2704
2705         def _parse_page(self, video_webpage):
2706                 """Extract video information from page"""
2707                 # General data
2708                 data = {'title': r'class="video_title datawrap">(.*?)</',
2709                         'description': r'<div class="datawrap">(.*?)</div>',
2710                         'owner': r'\("video_owner_name", "(.*?)"\)',
2711                         'upload_date': r'data-date="(.*?)"',
2712                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2713                         }
2714                 video_info = {}
2715                 for piece in data.keys():
2716                         mobj = re.search(data[piece], video_webpage)
2717                         if mobj is not None:
2718                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2719
2720                 # Video urls
2721                 video_urls = {}
2722                 for fmt in self._available_formats:
2723                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2724                         if mobj is not None:
2725                                 # URL is in a Javascript segment inside an escaped Unicode format within
2726                                 # the generally utf-8 page
2727                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2728                 video_info['video_urls'] = video_urls
2729
2730                 return video_info
2731
2732         def _real_initialize(self):
2733                 if self._downloader is None:
2734                         return
2735
2736                 useremail = None
2737                 password = None
2738                 downloader_params = self._downloader.params
2739
2740                 # Attempt to use provided username and password or .netrc data
2741                 if downloader_params.get('username', None) is not None:
2742                         useremail = downloader_params['username']
2743                         password = downloader_params['password']
2744                 elif downloader_params.get('usenetrc', False):
2745                         try:
2746                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2747                                 if info is not None:
2748                                         useremail = info[0]
2749                                         password = info[2]
2750                                 else:
2751                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2752                         except (IOError, netrc.NetrcParseError), err:
2753                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2754                                 return
2755
2756                 if useremail is None:
2757                         return
2758
2759                 # Log in
2760                 login_form = {
2761                         'email': useremail,
2762                         'pass': password,
2763                         'login': 'Log+In'
2764                         }
2765                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2766                 try:
2767                         self.report_login()
2768                         login_results = urllib2.urlopen(request).read()
2769                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2770                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2771                                 return
2772                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2773                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2774                         return
2775
2776         def _real_extract(self, url):
2777                 mobj = re.match(self._VALID_URL, url)
2778                 if mobj is None:
2779                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2780                         return
2781                 video_id = mobj.group('ID')
2782
2783                 # Get video webpage
2784                 self.report_video_webpage_download(video_id)
2785                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2786                 try:
2787                         page = urllib2.urlopen(request)
2788                         video_webpage = page.read()
2789                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2790                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2791                         return
2792
2793                 # Start extracting information
2794                 self.report_information_extraction(video_id)
2795
2796                 # Extract information
2797                 video_info = self._parse_page(video_webpage)
2798
2799                 # uploader
2800                 if 'owner' not in video_info:
2801                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2802                         return
2803                 video_uploader = video_info['owner']
2804
2805                 # title
2806                 if 'title' not in video_info:
2807                         self._downloader.trouble(u'ERROR: unable to extract video title')
2808                         return
2809                 video_title = video_info['title']
2810                 video_title = video_title.decode('utf-8')
2811                 video_title = sanitize_title(video_title)
2812
2813                 # simplified title
2814                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2815                 simple_title = simple_title.strip(ur'_')
2816
2817                 # thumbnail image
2818                 if 'thumbnail' not in video_info:
2819                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2820                         video_thumbnail = ''
2821                 else:
2822                         video_thumbnail = video_info['thumbnail']
2823
2824                 # upload date
2825                 upload_date = u'NA'
2826                 if 'upload_date' in video_info:
2827                         upload_time = video_info['upload_date']
2828                         timetuple = email.utils.parsedate_tz(upload_time)
2829                         if timetuple is not None:
2830                                 try:
2831                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2832                                 except:
2833                                         pass
2834
2835                 # description
2836                 video_description = video_info.get('description', 'No description available.')
2837
2838                 url_map = video_info['video_urls']
2839                 if len(url_map.keys()) > 0:
2840                         # Decide which formats to download
2841                         req_format = self._downloader.params.get('format', None)
2842                         format_limit = self._downloader.params.get('format_limit', None)
2843
2844                         if format_limit is not None and format_limit in self._available_formats:
2845                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2846                         else:
2847                                 format_list = self._available_formats
2848                         existing_formats = [x for x in format_list if x in url_map]
2849                         if len(existing_formats) == 0:
2850                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2851                                 return
2852                         if req_format is None:
2853                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2854                         elif req_format == '-1':
2855                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2856                         else:
2857                                 # Specific format
2858                                 if req_format not in url_map:
2859                                         self._downloader.trouble(u'ERROR: requested format not available')
2860                                         return
2861                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2862
2863                 for format_param, video_real_url in video_url_list:
2864
2865                         # At this point we have a new video
2866                         self._downloader.increment_downloads()
2867
2868                         # Extension
2869                         video_extension = self._video_extensions.get(format_param, 'mp4')
2870
2871                         try:
2872                                 # Process video information
2873                                 self._downloader.process_info({
2874                                         'id':           video_id.decode('utf-8'),
2875                                         'url':          video_real_url.decode('utf-8'),
2876                                         'uploader':     video_uploader.decode('utf-8'),
2877                                         'upload_date':  upload_date,
2878                                         'title':        video_title,
2879                                         'stitle':       simple_title,
2880                                         'ext':          video_extension.decode('utf-8'),
2881                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2882                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2883                                         'description':  video_description.decode('utf-8'),
2884                                         'player_url':   None,
2885                                 })
2886                         except UnavailableVideoError, err:
2887                                 self._downloader.trouble(u'\nERROR: unable to download video')
2888
2889 class BlipTVIE(InfoExtractor):
2890         """Information extractor for blip.tv"""
2891
2892         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2893         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2894
2895         @staticmethod
2896         def suitable(url):
2897                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2898
2899         def report_extraction(self, file_id):
2900                 """Report information extraction."""
2901                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2902
2903         def _simplify_title(self, title):
2904                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2905                 res = res.strip(ur'_')
2906                 return res
2907
2908         def _real_extract(self, url):
2909                 mobj = re.match(self._VALID_URL, url)
2910                 if mobj is None:
2911                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2912                         return
2913
2914                 if '?' in url:
2915                         cchar = '&'
2916                 else:
2917                         cchar = '?'
2918                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2919                 request = urllib2.Request(json_url)
2920                 self.report_extraction(mobj.group(1))
2921                 try:
2922                         json_code = urllib2.urlopen(request).read()
2923                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2924                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2925                         return
2926                 try:
2927                         json_data = json.loads(json_code)
2928                         if 'Post' in json_data:
2929                                 data = json_data['Post']
2930                         else:
2931                                 data = json_data
2932
2933                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2934                         video_url = data['media']['url']
2935                         umobj = re.match(self._URL_EXT, video_url)
2936                         if umobj is None:
2937                                 raise ValueError('Can not determine filename extension')
2938                         ext = umobj.group(1)
2939
2940                         self._downloader.increment_downloads()
2941
2942                         info = {
2943                                 'id': data['item_id'],
2944                                 'url': video_url,
2945                                 'uploader': data['display_name'],
2946                                 'upload_date': upload_date,
2947                                 'title': data['title'],
2948                                 'stitle': self._simplify_title(data['title']),
2949                                 'ext': ext,
2950                                 'format': data['media']['mimeType'],
2951                                 'thumbnail': data['thumbnailUrl'],
2952                                 'description': data['description'],
2953                                 'player_url': data['embedUrl']
2954                         }
2955                 except (ValueError,KeyError), err:
2956                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2957                         return
2958
2959                 try:
2960                         self._downloader.process_info(info)
2961                 except UnavailableVideoError, err:
2962                         self._downloader.trouble(u'\nERROR: unable to download video')
2963
2964
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After every successful download the downloader walks its chain
	of PostProcessors, calling run() on each: the first receives the
	initial information dictionary, each subsequent one receives the
	return value of its predecessor.

	The chain stops as soon as a run() call returns None, or when the
	last PostProcessor has executed.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary of the kind produced
		by InfoExtractors, extended with a "filepath" field pointing at
		the downloaded file.

		Returning None halts the postprocessing chain. Returning an
		information dictionary (possibly the received one, with some
		fields altered) passes it on to the next PostProcessor.

		This method may also raise PostProcessingError, which the calling
		downloader takes into account.
		"""
		return information # by default, do nothing
3010
3011
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that converts a downloaded video into an audio file.

	Relies on the external "ffprobe" and "ffmpeg" programs. The preferred
	codec may be 'best' (keep the existing audio stream losslessly when it
	is aac or mp3, re-encode to mp3 otherwise), 'aac' or 'mp3'.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name ffprobe reports for `path`, or None
		if ffprobe is unavailable, fails, or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# Open devnull explicitly and close it in a finally block; the
			# previous file(...) argument leaked a file descriptor per call.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		# ffprobe prints one key=value pair per line; the codec_name line
		# precedes the codec_type line of the same stream.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode `path` to `out_path` with the given audio codec and
		extra ffmpeg options. Return True iff ffmpeg exited with status 0."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# Same fd-leak fix as in get_audio_codec: close devnull ourselves.
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract the audio track of information['filepath'], delete the
		original video file, and point 'filepath' at the new audio file.
		Returns None (stopping the PP chain) on any failure."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible: copy the stream instead of re-encoding.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw aac streams need the ADTS container format.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy).
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3093
3094
def updateSelf(downloader, filename):
	"""Update the program file at `filename` with the latest version
	downloaded from UPDATE_URL, exiting the process on any failure.

	The downloader argument is only used for its screen-output helper.
	"""
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	try:
		# urlopen must happen *before* the inner try: previously it sat
		# inside it, so a failed urlopen left `urlh` unbound and the
		# finally-close raised an uncaught NameError instead of reaching
		# the (IOError, OSError) handler below.
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3122
def parseOpts():
	"""Build the command-line option parser and parse sys.argv.

	Returns (parser, opts, args) so the caller can both read the parsed
	options and report conflicts through parser.error().
	"""
	# Deferred import: only needed when actually running as a program.
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		"""Best-effort terminal width detection; returns None if unknown."""
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				pass # malformed COLUMNS value; fall through to stty

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (OSError, ValueError, IndexError):
			# stty missing (e.g. Windows), or it produced no/garbled output.
			# The previous bare `except:` also swallowed KeyboardInterrupt.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url...',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	general.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3287
def main():
	"""Parse the command line, wire up extractors and the downloader,
	run the downloads, and exit with the downloader's return code."""
	# Deferred import, like optparse in parseOpts. It must happen *here*:
	# previously getpass was only imported inside parseOpts' local scope,
	# so the password prompt below crashed with a NameError.
	import getpass

	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# A missing cookie file is fine: it is created on save.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError):
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print(std_headers['User-Agent'])
		sys.exit(0)

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Skip blank lines and '#'/'/'/';' comment lines.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError):
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError):
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError):
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3']:
			parser.error(u'invalid audio format specified')

	# Information extractors
	youtube_ie = YoutubeIE()
	metacafe_ie = MetacafeIE(youtube_ie)
	dailymotion_ie = DailymotionIE()
	youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
	youtube_user_ie = YoutubeUserIE(youtube_ie)
	youtube_search_ie = YoutubeSearchIE(youtube_ie)
	google_ie = GoogleIE()
	google_search_ie = GoogleSearchIE(google_ie)
	photobucket_ie = PhotobucketIE()
	yahoo_ie = YahooIE()
	yahoo_search_ie = YahooSearchIE(yahoo_ie)
	deposit_files_ie = DepositFilesIE()
	facebook_ie = FacebookIE()
	bliptv_ie = BlipTVIE()
	vimeo_ie = VimeoIE()
	generic_ie = GenericIE()

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
		'format': opts.format,
		'format_limit': opts.format_limit,
		# First matching template wins; the plain id template is the fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		})
	fd.add_info_extractor(youtube_search_ie)
	fd.add_info_extractor(youtube_pl_ie)
	fd.add_info_extractor(youtube_user_ie)
	fd.add_info_extractor(metacafe_ie)
	fd.add_info_extractor(dailymotion_ie)
	fd.add_info_extractor(youtube_ie)
	fd.add_info_extractor(google_ie)
	fd.add_info_extractor(google_search_ie)
	fd.add_info_extractor(photobucket_ie)
	fd.add_info_extractor(yahoo_ie)
	fd.add_info_extractor(yahoo_search_ie)
	fd.add_info_extractor(deposit_files_ie)
	fd.add_info_extractor(facebook_ie)
	fd.add_info_extractor(bliptv_ie)
	fd.add_info_extractor(vimeo_ie)

	# This must come last since it's the
	# fallback if none of the others work
	fd.add_info_extractor(generic_ie)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()
	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError):
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
3465
3466
# Script entry point: translate the exceptions main() lets escape into
# process exit codes / messages.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# NOTE(review): presumably the error was already printed where it
		# occurred; here we only signal failure via the exit status.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3476
3477 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: