--list-extractors (Closes #161)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors, roughly in order of first contribution.
__author__  = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',
	'Gergely Imreh',
	'Rogério Brito',
	'Philipp Hagemeister',
	'Sören Schulze',
	)

__license__ = 'Public Domain'
# Date-based version scheme: YYYY.MM.DD of the release.
__version__ = '2011.09.15'

# Canonical location of the latest release; used by the self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers added to every request (see YoutubeDLHandler below).
# They mimic a desktop Firefox so sites serve the same content they would
# serve a regular browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simple" titles: ASCII letters and digits,
# as unicode (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		# Minimal stand-in for the stdlib json module; only loads() is
		# provided.  All helpers below thread an index i through the input
		# string s and return (next_index, parsed_value) tuples.
		@staticmethod
		def loads(s):
			# s: a UTF-8 encoded JSON document (byte string).
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Uniform error reporting with position and remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past whitespace; with expectMore, running off the
				# end of the input is an error.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (simple escapes, \uXXXX,
				# and UTF-16 surrogate pairs) into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair: recombine high and low halves.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse a double-quoted string whose opening quote is at i.
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count the backslashes immediately before the quote:
					# an odd number means the quote itself is escaped.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse an object '{...}' whose opening brace is at i.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse an array '[...]' whose opening bracket is at i.
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse an int or float in JSON notation; a '.'/'e'/'E'
				# anywhere in the match forces a float.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first non-space character; anything not in
				# CHARMAP is assumed to start a number.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec exists and can actually encode text;
		# some platforms report bogus or unusable values.
		u'TEST'.encode(pref)
	except Exception:
		# Catch Exception rather than a bare except so KeyboardInterrupt
		# and SystemExit still propagate; fall back to UTF-8.
		pref = 'UTF-8'
	return pref
210
211
212 def htmlentity_transform(matchobj):
213         """Transforms an HTML entity to a Unicode character.
214
215         This function receives a match object and is intended to be used with
216         the re.sub() function.
217         """
218         entity = matchobj.group(1)
219
220         # Known non-numeric HTML entity
221         if entity in htmlentitydefs.name2codepoint:
222                 return unichr(htmlentitydefs.name2codepoint[entity])
223
224         # Unicode character
225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
226         if mobj is not None:
227                 numstr = mobj.group(1)
228                 if numstr.startswith(u'x'):
229                         base = 16
230                         numstr = u'0%s' % numstr
231                 else:
232                         base = 10
233                 return unichr(long(numstr, base))
234
235         # Unknown entity in name, return its literal representation
236         return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240         """Sanitizes a video title so it could be used as part of a filename."""
241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242         return utitle.replace(unicode(os.sep), u'%')
243
244
245 def sanitize_open(filename, open_mode):
246         """Try to open the given filename, and slightly tweak it if this fails.
247
248         Attempts to open the given filename. If this fails, it tries to change
249         the filename slightly, step by step, until it's either able to open it
250         or it fails and raises a final exception, like the standard open()
251         function.
252
253         It returns the tuple (stream, definitive_file_name).
254         """
255         try:
256                 if filename == u'-':
257                         if sys.platform == 'win32':
258                                 import msvcrt
259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260                         return (sys.stdout, filename)
261                 stream = open(filename, open_mode)
262                 return (stream, filename)
263         except (IOError, OSError), err:
264                 # In case of error, try to remove win32 forbidden chars
265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
266
267                 # An exception here should be caught in the caller
268                 stream = open(filename, open_mode)
269                 return (stream, filename)
270
271
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		# Unparseable date string: mirror parsedate_tz and return None.
		return None
	return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
	pass
289
290
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
	pass
298
299
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal an error during
	the postprocessing task.
	"""
	pass
307
308
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available
	for that video.
	"""
	pass
316
317
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out
	smaller than the size the server announced, which usually means the
	connection was interrupted.
	"""
	# Byte counts describing the mismatch.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Decompress a raw deflate stream (negative wbits = no zlib
		# header); if that fails, retry assuming a zlib-wrapped stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Build an addinfourl carrying the response code: newer Python 2
		# versions accept it in the constructor, older ones need it set
		# as an attribute afterwards.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Re-apply std_headers so they take precedence over any values
		# urllib2 or the caller may already have set.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE(review): the odd casing here presumably matches how urllib2
		# stores header names internally — confirm before changing it.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, preserving the original
		# response's headers, URL, code and msg.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
	# Class-level placeholders; the real per-instance values are assigned
	# in __init__ below.
	params = None              # Options dictionary (see class docstring)
	_ies = []                  # Registered InfoExtractor objects
	_pps = []                  # Registered PostProcessor objects
	_download_retcode = None   # Exit code: 0 until trouble() records an error
	_num_downloads = None      # Ordinal of the current download (%(autonumber)s)
	_screen_file = None        # Stream for screen output (stdout or stderr)
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
531         def add_info_extractor(self, ie):
532                 """Add an InfoExtractor object to the end of the list."""
533                 self._ies.append(ie)
534                 ie.set_downloader(self)
535
536         def add_post_processor(self, pp):
537                 """Add a PostProcessor object to the end of the chain."""
538                 self._pps.append(pp)
539                 pp.set_downloader(self)
540
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to the screen stream if not in quiet mode.

		skip_eol: suppress the newline so progress lines can be rewritten
		in place with a leading '\\r'.
		ignore_encoding_errors: silently drop the message instead of
		raising UnicodeEncodeError.
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
551
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		print >>sys.stderr, message.encode(preferredencoding())
555
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style escape sequence to set the window/icon title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567         def fixed_template(self):
568                 """Checks if the output template is fixed."""
569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
609         def try_rename(self, old_filename, new_filename):
610                 try:
611                         if old_filename == new_filename:
612                                 return
613                         os.rename(old_filename, new_filename)
614                 except (IOError, OSError), err:
615                         self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633
634         def report_writedescription(self, descfn):
635                 """ Report that the description file is being written """
636                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
637
638         def report_writeinfojson(self, infofn):
639                 """ Report that the metadata file has been written """
640                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
641
642         def report_destination(self, filename):
643                 """Report destination filename."""
644                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
645
646         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
647                 """Report download progress."""
648                 if self.params.get('noprogress', False):
649                         return
650                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
651                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
652                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
653                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
654
655         def report_resuming_byte(self, resume_len):
656                 """Report attempt to resume at given byte."""
657                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
658
659         def report_retry(self, count, retries):
660                 """Report retry in case of HTTP error 5xx"""
661                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
662
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The file name may not be representable in the console
			# encoding; fall back to a message without it.
			self.to_screen(u'[download] The file has already been downloaded')
669
670         def report_unable_to_resume(self):
671                 """Report it was impossible to resume download."""
672                 self.to_screen(u'[download] Unable to resume')
673
674         def report_finish(self):
675                 """Report download finished."""
676                 if self.params.get('noprogress', False):
677                         self.to_screen(u'[download] Download completed')
678                 else:
679                         self.to_screen(u'')
680
681         def increment_downloads(self):
682                 """Increment the ordinal that assigns a number to each file."""
683                 self._num_downloads += 1
684
685         def prepare_filename(self, info_dict):
686                 """Generate the output filename."""
687                 try:
688                         template_dict = dict(info_dict)
689                         template_dict['epoch'] = unicode(long(time.time()))
690                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691                         filename = self.params['outtmpl'] % template_dict
692                         return filename
693                 except (ValueError, KeyError), err:
694                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
695                         return None
696
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles simulate mode (forced metadata printing), title
		match/reject filters, the no-overwrite check, optional
		description and .info.json sidecar files, the actual download,
		and finally the postprocessing chain.
		"""
		# prepare_filename returns None if the output template is broken.
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return

		# Title filters: --match-title / --reject-title regexes, applied
		# case-insensitively to the encoded title.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory if necessary.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		# Optional "<filename>.description" sidecar with the UTF-8 description.
		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		# Optional "<filename>.info.json" sidecar with the whole info dict.
		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe that a usable json.dump exists without calling it;
				# presumably json may be missing or stubbed on old Pythons,
				# hence NameError/AttributeError — TODO confirm against the
				# import section of this file.
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		# Perform the actual download; OS-level errors become
		# UnavailableVideoError, network errors are reported and swallowed.
		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		# Only run postprocessors on a successfully downloaded file.
		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
789
790         def download(self, url_list):
791                 """Download a given list of URLs."""
792                 if len(url_list) > 1 and self.fixed_template():
793                         raise SameFileError(self.params['outtmpl'])
794
795                 for url in url_list:
796                         suitable_found = False
797                         for ie in self._ies:
798                                 # Go to next InfoExtractor if not suitable
799                                 if not ie.suitable(url):
800                                         continue
801
802                                 # Suitable InfoExtractor found
803                                 suitable_found = True
804
805                                 # Extract information from URL and process it
806                                 ie.extract(url)
807
808                                 # Suitable InfoExtractor had been found; go to next URL
809                                 break
810
811                         if not suitable_found:
812                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
813
814                 return self._download_retcode
815
816         def post_process(self, filename, ie_info):
817                 """Run the postprocessing chain on the given file."""
818                 info = dict(ie_info)
819                 info['filepath'] = filename
820                 for pp in self._pps:
821                         info = pp.run(info)
822                         if info is None:
823                                 break
824
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to rtmpdump.

		Writes to a temporary filename first and renames on success.
		Returns True on success, False on failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The "[[], [...]][bool]" idiom conditionally appends arguments:
		# -W passes the SWF player URL, -e resumes, -k 1 is an extra flag
		# used together with resuming.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress between attempts on exit code 1: give up.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
861
	def _do_download(self, filename, url, player_url):
		"""Download *url* to *filename* over HTTP, or delegate RTMP URLs
		to rtmpdump.

		Supports resuming into a temporary ".part"-style file, bounded
		retries on 5xx errors, progress reporting and rate limiting.
		Returns True on success (including "already fully downloaded"),
		False on a reported failure; may raise ContentTooShortError or
		re-raise unexpected urllib2.HTTPError.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header so we can probe
		# the full content length when a resume attempt gets HTTP 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range', 'bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# NOTE(review): if the server sends no Content-length, data_len
		# stays None and the "data_len - resume_len" below would raise
		# TypeError — confirm whether servers without Content-length are
		# expected here.
		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					# sanitize_open may adjust the filename for the
					# filesystem; undo the temporary suffix for display.
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the read size to the observed throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
997
998
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it: the real media URL, the title
	and related metadata. Those dictionaries are handed to the
	FileDownloader, which may download the video to the file system or
	act on them in other ways. Every dictionary must carry the fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional; they mainly allow youtube-dl to
	serve as the backend of a video search function (such as the one in
	youtube2mp3) and are only read by the corresponding forced-printing
	options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression. They should usually
	also be added to the list of extractors.
	"""

	# Whether _real_initialize() has run (lazy one-time setup).
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc), at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1067
1068
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts bare video ids as well as watch/embed/v/youtu.be URLs;
	# group 2 of the match is the video id (see _real_extract).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL fetched by _real_initialize to switch the site to English/US.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	# Endpoints used for optional login and age confirmation.
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
	# Maps a format code to the file extension used for the output file.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
	IE_NAME = u'youtube'
1090
1091         def report_lang(self):
1092                 """Report attempt to set language."""
1093                 self._downloader.to_screen(u'[youtube] Setting language')
1094
1095         def report_login(self):
1096                 """Report attempt to log in."""
1097                 self._downloader.to_screen(u'[youtube] Logging in')
1098
1099         def report_age_confirmation(self):
1100                 """Report attempt to confirm age."""
1101                 self._downloader.to_screen(u'[youtube] Confirming age')
1102
1103         def report_video_webpage_download(self, video_id):
1104                 """Report attempt to download video webpage."""
1105                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1106
1107         def report_video_info_webpage_download(self, video_id):
1108                 """Report attempt to download video info webpage."""
1109                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1110
1111         def report_information_extraction(self, video_id):
1112                 """Report attempt to extract video information."""
1113                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1114
1115         def report_unavailable_format(self, video_id, format):
1116                 """Report extracted video URL."""
1117                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1118
1119         def report_rtmp_download(self):
1120                 """Indicate the download will use the RTMP protocol."""
1121                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1122
	def _real_initialize(self):
		"""One-time setup: force the English site, then optionally log in
		and confirm age.

		Credentials come from the downloader parameters or from the
		'youtube' entry in ~/.netrc. All failures here are reported as
		warnings (or an error for age confirmation) and abort silently.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1191
1192         def _real_extract(self, url):
1193                 # Extract video id from URL
1194                 mobj = re.match(self._VALID_URL, url)
1195                 if mobj is None:
1196                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1197                         return
1198                 video_id = mobj.group(2)
1199
1200                 # Get video webpage
1201                 self.report_video_webpage_download(video_id)
1202                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1203                 try:
1204                         video_webpage = urllib2.urlopen(request).read()
1205                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1206                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1207                         return
1208
1209                 # Attempt to extract SWF player URL
1210                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1211                 if mobj is not None:
1212                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1213                 else:
1214                         player_url = None
1215
1216                 # Get video info
1217                 self.report_video_info_webpage_download(video_id)
1218                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1219                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1220                                         % (video_id, el_type))
1221                         request = urllib2.Request(video_info_url)
1222                         try:
1223                                 video_info_webpage = urllib2.urlopen(request).read()
1224                                 video_info = parse_qs(video_info_webpage)
1225                                 if 'token' in video_info:
1226                                         break
1227                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1229                                 return
1230                 if 'token' not in video_info:
1231                         if 'reason' in video_info:
1232                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1233                         else:
1234                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1235                         return
1236
1237                 # Start extracting information
1238                 self.report_information_extraction(video_id)
1239
1240                 # uploader
1241                 if 'author' not in video_info:
1242                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1243                         return
1244                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1245
1246                 # title
1247                 if 'title' not in video_info:
1248                         self._downloader.trouble(u'ERROR: unable to extract video title')
1249                         return
1250                 video_title = urllib.unquote_plus(video_info['title'][0])
1251                 video_title = video_title.decode('utf-8')
1252                 video_title = sanitize_title(video_title)
1253
1254                 # simplified title
1255                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1256                 simple_title = simple_title.strip(ur'_')
1257
1258                 # thumbnail image
1259                 if 'thumbnail_url' not in video_info:
1260                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1261                         video_thumbnail = ''
1262                 else:   # don't panic if we can't find it
1263                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1264
1265                 # upload date
1266                 upload_date = u'NA'
1267                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1268                 if mobj is not None:
1269                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1270                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1271                         for expression in format_expressions:
1272                                 try:
1273                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1274                                 except:
1275                                         pass
1276
1277                 # description
1278                 try:
1279                         lxml.etree
1280                 except NameError:
1281                         video_description = u'No description available.'
1282                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1283                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1284                                 if mobj is not None:
1285                                         video_description = mobj.group(1).decode('utf-8')
1286                 else:
1287                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1288                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1289                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1290                         # TODO use another parser
1291
1292                 # token
1293                 video_token = urllib.unquote_plus(video_info['token'][0])
1294
1295                 # Decide which formats to download
1296                 req_format = self._downloader.params.get('format', None)
1297
1298                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1299                         self.report_rtmp_download()
1300                         video_url_list = [(None, video_info['conn'][0])]
1301                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1302                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1303                         url_data = [parse_qs(uds) for uds in url_data_strs]
1304                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1305                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1306
1307                         format_limit = self._downloader.params.get('format_limit', None)
1308                         if format_limit is not None and format_limit in self._available_formats:
1309                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1310                         else:
1311                                 format_list = self._available_formats
1312                         existing_formats = [x for x in format_list if x in url_map]
1313                         if len(existing_formats) == 0:
1314                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1315                                 return
1316                         if req_format is None:
1317                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1318                         elif req_format == '-1':
1319                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1320                         else:
1321                                 # Specific format
1322                                 if req_format not in url_map:
1323                                         self._downloader.trouble(u'ERROR: requested format not available')
1324                                         return
1325                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1326                 else:
1327                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1328                         return
1329
1330                 for format_param, video_real_url in video_url_list:
1331                         # At this point we have a new video
1332                         self._downloader.increment_downloads()
1333
1334                         # Extension
1335                         video_extension = self._video_extensions.get(format_param, 'flv')
1336
1337                         try:
1338                                 # Process video information
1339                                 self._downloader.process_info({
1340                                         'id':           video_id.decode('utf-8'),
1341                                         'url':          video_real_url.decode('utf-8'),
1342                                         'uploader':     video_uploader.decode('utf-8'),
1343                                         'upload_date':  upload_date,
1344                                         'title':        video_title,
1345                                         'stitle':       simple_title,
1346                                         'ext':          video_extension.decode('utf-8'),
1347                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1348                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1349                                         'description':  video_description,
1350                                         'player_url':   player_url,
1351                                 })
1352                         except UnavailableVideoError, err:
1353                                 self._downloader.trouble(u'\nERROR: unable to download video')
1354
1355
1356 class MetacafeIE(InfoExtractor):
1357         """Information Extractor for metacafe.com."""
1358
1359         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1360         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1361         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1362         _youtube_ie = None
1363         IE_NAME = u'metacafe'
1364
1365         def __init__(self, youtube_ie, downloader=None):
1366                 InfoExtractor.__init__(self, downloader)
1367                 self._youtube_ie = youtube_ie
1368
1369         def report_disclaimer(self):
1370                 """Report disclaimer retrieval."""
1371                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1372
1373         def report_age_confirmation(self):
1374                 """Report attempt to confirm age."""
1375                 self._downloader.to_screen(u'[metacafe] Confirming age')
1376
1377         def report_download_webpage(self, video_id):
1378                 """Report webpage download."""
1379                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1380
1381         def report_extraction(self, video_id):
1382                 """Report information extraction."""
1383                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1384
1385         def _real_initialize(self):
1386                 # Retrieve disclaimer
1387                 request = urllib2.Request(self._DISCLAIMER)
1388                 try:
1389                         self.report_disclaimer()
1390                         disclaimer = urllib2.urlopen(request).read()
1391                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1393                         return
1394
1395                 # Confirm age
1396                 disclaimer_form = {
1397                         'filters': '0',
1398                         'submit': "Continue - I'm over 18",
1399                         }
1400                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1401                 try:
1402                         self.report_age_confirmation()
1403                         disclaimer = urllib2.urlopen(request).read()
1404                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1405                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1406                         return
1407
1408         def _real_extract(self, url):
1409                 # Extract id and simplified title from URL
1410                 mobj = re.match(self._VALID_URL, url)
1411                 if mobj is None:
1412                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1413                         return
1414
1415                 video_id = mobj.group(1)
1416
1417                 # Check if video comes from YouTube
1418                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1419                 if mobj2 is not None:
1420                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1421                         return
1422
1423                 # At this point we have a new video
1424                 self._downloader.increment_downloads()
1425
1426                 simple_title = mobj.group(2).decode('utf-8')
1427
1428                 # Retrieve video webpage to extract further information
1429                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1430                 try:
1431                         self.report_download_webpage(video_id)
1432                         webpage = urllib2.urlopen(request).read()
1433                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1434                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1435                         return
1436
1437                 # Extract URL, uploader and title from webpage
1438                 self.report_extraction(video_id)
1439                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1440                 if mobj is not None:
1441                         mediaURL = urllib.unquote(mobj.group(1))
1442                         video_extension = mediaURL[-3:]
1443
1444                         # Extract gdaKey if available
1445                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1446                         if mobj is None:
1447                                 video_url = mediaURL
1448                         else:
1449                                 gdaKey = mobj.group(1)
1450                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1451                 else:
1452                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1453                         if mobj is None:
1454                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1455                                 return
1456                         vardict = parse_qs(mobj.group(1))
1457                         if 'mediaData' not in vardict:
1458                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1459                                 return
1460                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1461                         if mobj is None:
1462                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1463                                 return
1464                         mediaURL = mobj.group(1).replace('\\/', '/')
1465                         video_extension = mediaURL[-3:]
1466                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1467
1468                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1469                 if mobj is None:
1470                         self._downloader.trouble(u'ERROR: unable to extract title')
1471                         return
1472                 video_title = mobj.group(1).decode('utf-8')
1473                 video_title = sanitize_title(video_title)
1474
1475                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1476                 if mobj is None:
1477                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1478                         return
1479                 video_uploader = mobj.group(1)
1480
1481                 try:
1482                         # Process video information
1483                         self._downloader.process_info({
1484                                 'id':           video_id.decode('utf-8'),
1485                                 'url':          video_url.decode('utf-8'),
1486                                 'uploader':     video_uploader.decode('utf-8'),
1487                                 'upload_date':  u'NA',
1488                                 'title':        video_title,
1489                                 'stitle':       simple_title,
1490                                 'ext':          video_extension.decode('utf-8'),
1491                                 'format':       u'NA',
1492                                 'player_url':   None,
1493                         })
1494                 except UnavailableVideoError:
1495                         self._downloader.trouble(u'\nERROR: unable to download video')
1496
1497
1498 class DailymotionIE(InfoExtractor):
1499         """Information Extractor for Dailymotion"""
1500
1501         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1502         IE_NAME = u'dailymotion'
1503
1504         def __init__(self, downloader=None):
1505                 InfoExtractor.__init__(self, downloader)
1506
1507         def report_download_webpage(self, video_id):
1508                 """Report webpage download."""
1509                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1510
1511         def report_extraction(self, video_id):
1512                 """Report information extraction."""
1513                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1514
1515         def _real_initialize(self):
1516                 return
1517
1518         def _real_extract(self, url):
1519                 # Extract id and simplified title from URL
1520                 mobj = re.match(self._VALID_URL, url)
1521                 if mobj is None:
1522                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1523                         return
1524
1525                 # At this point we have a new video
1526                 self._downloader.increment_downloads()
1527                 video_id = mobj.group(1)
1528
1529                 simple_title = mobj.group(2).decode('utf-8')
1530                 video_extension = 'flv'
1531
1532                 # Retrieve video webpage to extract further information
1533                 request = urllib2.Request(url)
1534                 request.add_header('Cookie', 'family_filter=off')
1535                 try:
1536                         self.report_download_webpage(video_id)
1537                         webpage = urllib2.urlopen(request).read()
1538                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1539                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1540                         return
1541
1542                 # Extract URL, uploader and title from webpage
1543                 self.report_extraction(video_id)
1544                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1545                 if mobj is None:
1546                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1547                         return
1548                 sequence = urllib.unquote(mobj.group(1))
1549                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1550                 if mobj is None:
1551                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1552                         return
1553                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1554
1555                 # if needed add http://www.dailymotion.com/ if relative URL
1556
1557                 video_url = mediaURL
1558
1559                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1560                 if mobj is None:
1561                         self._downloader.trouble(u'ERROR: unable to extract title')
1562                         return
1563                 video_title = mobj.group(1).decode('utf-8')
1564                 video_title = sanitize_title(video_title)
1565
1566                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1567                 if mobj is None:
1568                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1569                         return
1570                 video_uploader = mobj.group(1)
1571
1572                 try:
1573                         # Process video information
1574                         self._downloader.process_info({
1575                                 'id':           video_id.decode('utf-8'),
1576                                 'url':          video_url.decode('utf-8'),
1577                                 'uploader':     video_uploader.decode('utf-8'),
1578                                 'upload_date':  u'NA',
1579                                 'title':        video_title,
1580                                 'stitle':       simple_title,
1581                                 'ext':          video_extension.decode('utf-8'),
1582                                 'format':       u'NA',
1583                                 'player_url':   None,
1584                         })
1585                 except UnavailableVideoError:
1586                         self._downloader.trouble(u'\nERROR: unable to download video')
1587
1588
1589 class GoogleIE(InfoExtractor):
1590         """Information extractor for video.google.com."""
1591
1592         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1593         IE_NAME = u'video.google'
1594
1595         def __init__(self, downloader=None):
1596                 InfoExtractor.__init__(self, downloader)
1597
1598         def report_download_webpage(self, video_id):
1599                 """Report webpage download."""
1600                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1601
1602         def report_extraction(self, video_id):
1603                 """Report information extraction."""
1604                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1605
1606         def _real_initialize(self):
1607                 return
1608
1609         def _real_extract(self, url):
1610                 # Extract id from URL
1611                 mobj = re.match(self._VALID_URL, url)
1612                 if mobj is None:
1613                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1614                         return
1615
1616                 # At this point we have a new video
1617                 self._downloader.increment_downloads()
1618                 video_id = mobj.group(1)
1619
1620                 video_extension = 'mp4'
1621
1622                 # Retrieve video webpage to extract further information
1623                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1624                 try:
1625                         self.report_download_webpage(video_id)
1626                         webpage = urllib2.urlopen(request).read()
1627                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1628                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1629                         return
1630
1631                 # Extract URL, uploader, and title from webpage
1632                 self.report_extraction(video_id)
1633                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1634                 if mobj is None:
1635                         video_extension = 'flv'
1636                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1637                 if mobj is None:
1638                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1639                         return
1640                 mediaURL = urllib.unquote(mobj.group(1))
1641                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1642                 mediaURL = mediaURL.replace('\\x26', '\x26')
1643
1644                 video_url = mediaURL
1645
1646                 mobj = re.search(r'<title>(.*)</title>', webpage)
1647                 if mobj is None:
1648                         self._downloader.trouble(u'ERROR: unable to extract title')
1649                         return
1650                 video_title = mobj.group(1).decode('utf-8')
1651                 video_title = sanitize_title(video_title)
1652                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1653
1654                 # Extract video description
1655                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1656                 if mobj is None:
1657                         self._downloader.trouble(u'ERROR: unable to extract video description')
1658                         return
1659                 video_description = mobj.group(1).decode('utf-8')
1660                 if not video_description:
1661                         video_description = 'No description available.'
1662
1663                 # Extract video thumbnail
1664                 if self._downloader.params.get('forcethumbnail', False):
1665                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1666                         try:
1667                                 webpage = urllib2.urlopen(request).read()
1668                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1669                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1670                                 return
1671                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1672                         if mobj is None:
1673                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1674                                 return
1675                         video_thumbnail = mobj.group(1)
1676                 else:   # we need something to pass to process_info
1677                         video_thumbnail = ''
1678
1679                 try:
1680                         # Process video information
1681                         self._downloader.process_info({
1682                                 'id':           video_id.decode('utf-8'),
1683                                 'url':          video_url.decode('utf-8'),
1684                                 'uploader':     u'NA',
1685                                 'upload_date':  u'NA',
1686                                 'title':        video_title,
1687                                 'stitle':       simple_title,
1688                                 'ext':          video_extension.decode('utf-8'),
1689                                 'format':       u'NA',
1690                                 'player_url':   None,
1691                         })
1692                 except UnavailableVideoError:
1693                         self._downloader.trouble(u'\nERROR: unable to download video')
1694
1695
1696 class PhotobucketIE(InfoExtractor):
1697         """Information extractor for photobucket.com."""
1698
1699         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1700         IE_NAME = u'photobucket'
1701
1702         def __init__(self, downloader=None):
1703                 InfoExtractor.__init__(self, downloader)
1704
1705         def report_download_webpage(self, video_id):
1706                 """Report webpage download."""
1707                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1708
1709         def report_extraction(self, video_id):
1710                 """Report information extraction."""
1711                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1712
1713         def _real_initialize(self):
1714                 return
1715
1716         def _real_extract(self, url):
1717                 # Extract id from URL
1718                 mobj = re.match(self._VALID_URL, url)
1719                 if mobj is None:
1720                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1721                         return
1722
1723                 # At this point we have a new video
1724                 self._downloader.increment_downloads()
1725                 video_id = mobj.group(1)
1726
1727                 video_extension = 'flv'
1728
1729                 # Retrieve video webpage to extract further information
1730                 request = urllib2.Request(url)
1731                 try:
1732                         self.report_download_webpage(video_id)
1733                         webpage = urllib2.urlopen(request).read()
1734                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1736                         return
1737
1738                 # Extract URL, uploader, and title from webpage
1739                 self.report_extraction(video_id)
1740                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1741                 if mobj is None:
1742                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1743                         return
1744                 mediaURL = urllib.unquote(mobj.group(1))
1745
1746                 video_url = mediaURL
1747
1748                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1749                 if mobj is None:
1750                         self._downloader.trouble(u'ERROR: unable to extract title')
1751                         return
1752                 video_title = mobj.group(1).decode('utf-8')
1753                 video_title = sanitize_title(video_title)
1754                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1755
1756                 video_uploader = mobj.group(2).decode('utf-8')
1757
1758                 try:
1759                         # Process video information
1760                         self._downloader.process_info({
1761                                 'id':           video_id.decode('utf-8'),
1762                                 'url':          video_url.decode('utf-8'),
1763                                 'uploader':     video_uploader,
1764                                 'upload_date':  u'NA',
1765                                 'title':        video_title,
1766                                 'stitle':       simple_title,
1767                                 'ext':          video_extension.decode('utf-8'),
1768                                 'format':       u'NA',
1769                                 'player_url':   None,
1770                         })
1771                 except UnavailableVideoError:
1772                         self._downloader.trouble(u'\nERROR: unable to download video')
1773
1774
1775 class YahooIE(InfoExtractor):
1776         """Information extractor for video.yahoo.com."""
1777
1778         # _VALID_URL matches all Yahoo! Video URLs
1779         # _VPAGE_URL matches only the extractable '/watch/' URLs
1780         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1781         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1782         IE_NAME = u'video.yahoo'
1783
	def __init__(self, downloader=None):
		# Plain pass-through to the base class; Yahoo needs no extra state.
		InfoExtractor.__init__(self, downloader)
1786
1787         def report_download_webpage(self, video_id):
1788                 """Report webpage download."""
1789                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1790
1791         def report_extraction(self, video_id):
1792                 """Report information extraction."""
1793                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1794
1795         def _real_initialize(self):
1796                 return
1797
1798         def _real_extract(self, url, new_video=True):
1799                 # Extract ID from URL
1800                 mobj = re.match(self._VALID_URL, url)
1801                 if mobj is None:
1802                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1803                         return
1804
1805                 # At this point we have a new video
1806                 self._downloader.increment_downloads()
1807                 video_id = mobj.group(2)
1808                 video_extension = 'flv'
1809
1810                 # Rewrite valid but non-extractable URLs as
1811                 # extractable English language /watch/ URLs
1812                 if re.match(self._VPAGE_URL, url) is None:
1813                         request = urllib2.Request(url)
1814                         try:
1815                                 webpage = urllib2.urlopen(request).read()
1816                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1817                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1818                                 return
1819
1820                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1821                         if mobj is None:
1822                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1823                                 return
1824                         yahoo_id = mobj.group(1)
1825
1826                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1827                         if mobj is None:
1828                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1829                                 return
1830                         yahoo_vid = mobj.group(1)
1831
1832                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1833                         return self._real_extract(url, new_video=False)
1834
1835                 # Retrieve video webpage to extract further information
1836                 request = urllib2.Request(url)
1837                 try:
1838                         self.report_download_webpage(video_id)
1839                         webpage = urllib2.urlopen(request).read()
1840                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1842                         return
1843
1844                 # Extract uploader and title from webpage
1845                 self.report_extraction(video_id)
1846                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1847                 if mobj is None:
1848                         self._downloader.trouble(u'ERROR: unable to extract video title')
1849                         return
1850                 video_title = mobj.group(1).decode('utf-8')
1851                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1852
1853                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1854                 if mobj is None:
1855                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1856                         return
1857                 video_uploader = mobj.group(1).decode('utf-8')
1858
1859                 # Extract video thumbnail
1860                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1861                 if mobj is None:
1862                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1863                         return
1864                 video_thumbnail = mobj.group(1).decode('utf-8')
1865
1866                 # Extract video description
1867                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: unable to extract video description')
1870                         return
1871                 video_description = mobj.group(1).decode('utf-8')
1872                 if not video_description:
1873                         video_description = 'No description available.'
1874
1875                 # Extract video height and width
1876                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1877                 if mobj is None:
1878                         self._downloader.trouble(u'ERROR: unable to extract video height')
1879                         return
1880                 yv_video_height = mobj.group(1)
1881
1882                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1883                 if mobj is None:
1884                         self._downloader.trouble(u'ERROR: unable to extract video width')
1885                         return
1886                 yv_video_width = mobj.group(1)
1887
1888                 # Retrieve video playlist to extract media URL
1889                 # I'm not completely sure what all these options are, but we
1890                 # seem to need most of them, otherwise the server sends a 401.
1891                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1892                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1893                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1894                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1895                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1896                 try:
1897                         self.report_download_webpage(video_id)
1898                         webpage = urllib2.urlopen(request).read()
1899                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1901                         return
1902
1903                 # Extract media URL from playlist XML
1904                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1905                 if mobj is None:
1906                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1907                         return
1908                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1909                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1910
1911                 try:
1912                         # Process video information
1913                         self._downloader.process_info({
1914                                 'id':           video_id.decode('utf-8'),
1915                                 'url':          video_url,
1916                                 'uploader':     video_uploader,
1917                                 'upload_date':  u'NA',
1918                                 'title':        video_title,
1919                                 'stitle':       simple_title,
1920                                 'ext':          video_extension.decode('utf-8'),
1921                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1922                                 'description':  video_description,
1923                                 'thumbnail':    video_thumbnail,
1924                                 'player_url':   None,
1925                         })
1926                 except UnavailableVideoError:
1927                         self._downloader.trouble(u'\nERROR: unable to download video')
1928
1929
1930 class VimeoIE(InfoExtractor):
1931         """Information extractor for vimeo.com."""
1932
1933         # _VALID_URL matches Vimeo URLs
1934         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1935         IE_NAME = u'vimeo'
1936
1937         def __init__(self, downloader=None):
1938                 InfoExtractor.__init__(self, downloader)
1939
1940         def report_download_webpage(self, video_id):
1941                 """Report webpage download."""
1942                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1943
1944         def report_extraction(self, video_id):
1945                 """Report information extraction."""
1946                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1947
1948         def _real_initialize(self):
1949                 return
1950
1951         def _real_extract(self, url, new_video=True):
1952                 # Extract ID from URL
1953                 mobj = re.match(self._VALID_URL, url)
1954                 if mobj is None:
1955                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1956                         return
1957
1958                 # At this point we have a new video
1959                 self._downloader.increment_downloads()
1960                 video_id = mobj.group(1)
1961
1962                 # Retrieve video webpage to extract further information
1963                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1964                 try:
1965                         self.report_download_webpage(video_id)
1966                         webpage = urllib2.urlopen(request).read()
1967                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1968                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1969                         return
1970
1971                 # Now we begin extracting as much information as we can from what we
1972                 # retrieved. First we extract the information common to all extractors,
1973                 # and latter we extract those that are Vimeo specific.
1974                 self.report_extraction(video_id)
1975
1976                 # Extract title
1977                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: unable to extract video title')
1980                         return
1981                 video_title = mobj.group(1).decode('utf-8')
1982                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1983
1984                 # Extract uploader
1985                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1986                 if mobj is None:
1987                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1988                         return
1989                 video_uploader = mobj.group(1).decode('utf-8')
1990
1991                 # Extract video thumbnail
1992                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1995                         return
1996                 video_thumbnail = mobj.group(1).decode('utf-8')
1997
1998                 # # Extract video description
1999                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2000                 # if mobj is None:
2001                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2002                 #       return
2003                 # video_description = mobj.group(1).decode('utf-8')
2004                 # if not video_description: video_description = 'No description available.'
2005                 video_description = 'Foo.'
2006
2007                 # Vimeo specific: extract request signature
2008                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2009                 if mobj is None:
2010                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2011                         return
2012                 sig = mobj.group(1).decode('utf-8')
2013
2014                 # Vimeo specific: Extract request signature expiration
2015                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2016                 if mobj is None:
2017                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2018                         return
2019                 sig_exp = mobj.group(1).decode('utf-8')
2020
2021                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2022
2023                 try:
2024                         # Process video information
2025                         self._downloader.process_info({
2026                                 'id':           video_id.decode('utf-8'),
2027                                 'url':          video_url,
2028                                 'uploader':     video_uploader,
2029                                 'upload_date':  u'NA',
2030                                 'title':        video_title,
2031                                 'stitle':       simple_title,
2032                                 'ext':          u'mp4',
2033                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2034                                 'description':  video_description,
2035                                 'thumbnail':    video_thumbnail,
2036                                 'description':  video_description,
2037                                 'player_url':   None,
2038                         })
2039                 except UnavailableVideoError:
2040                         self._downloader.trouble(u'ERROR: unable to download video')
2041
2042
2043 class GenericIE(InfoExtractor):
2044         """Generic last-resort information extractor."""
2045
2046         _VALID_URL = r'.*'
2047         IE_NAME = u'generic'
2048
2049         def __init__(self, downloader=None):
2050                 InfoExtractor.__init__(self, downloader)
2051
2052         def report_download_webpage(self, video_id):
2053                 """Report webpage download."""
2054                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2055                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2056
2057         def report_extraction(self, video_id):
2058                 """Report information extraction."""
2059                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2060
2061         def _real_initialize(self):
2062                 return
2063
2064         def _real_extract(self, url):
2065                 # At this point we have a new video
2066                 self._downloader.increment_downloads()
2067
2068                 video_id = url.split('/')[-1]
2069                 request = urllib2.Request(url)
2070                 try:
2071                         self.report_download_webpage(video_id)
2072                         webpage = urllib2.urlopen(request).read()
2073                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2074                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2075                         return
2076                 except ValueError, err:
2077                         # since this is the last-resort InfoExtractor, if
2078                         # this error is thrown, it'll be thrown here
2079                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2080                         return
2081
2082                 self.report_extraction(video_id)
2083                 # Start with something easy: JW Player in SWFObject
2084                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2085                 if mobj is None:
2086                         # Broaden the search a little bit
2087                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2088                 if mobj is None:
2089                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2090                         return
2091
2092                 # It's possible that one of the regexes
2093                 # matched, but returned an empty group:
2094                 if mobj.group(1) is None:
2095                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2096                         return
2097
2098                 video_url = urllib.unquote(mobj.group(1))
2099                 video_id = os.path.basename(video_url)
2100
2101                 # here's a fun little line of code for you:
2102                 video_extension = os.path.splitext(video_id)[1][1:]
2103                 video_id = os.path.splitext(video_id)[0]
2104
2105                 # it's tempting to parse this further, but you would
2106                 # have to take into account all the variations like
2107                 #   Video Title - Site Name
2108                 #   Site Name | Video Title
2109                 #   Video Title - Tagline | Site Name
2110                 # and so on and so forth; it's just not practical
2111                 mobj = re.search(r'<title>(.*)</title>', webpage)
2112                 if mobj is None:
2113                         self._downloader.trouble(u'ERROR: unable to extract title')
2114                         return
2115                 video_title = mobj.group(1).decode('utf-8')
2116                 video_title = sanitize_title(video_title)
2117                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2118
2119                 # video uploader is domain name
2120                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2121                 if mobj is None:
2122                         self._downloader.trouble(u'ERROR: unable to extract title')
2123                         return
2124                 video_uploader = mobj.group(1).decode('utf-8')
2125
2126                 try:
2127                         # Process video information
2128                         self._downloader.process_info({
2129                                 'id':           video_id.decode('utf-8'),
2130                                 'url':          video_url.decode('utf-8'),
2131                                 'uploader':     video_uploader,
2132                                 'upload_date':  u'NA',
2133                                 'title':        video_title,
2134                                 'stitle':       simple_title,
2135                                 'ext':          video_extension.decode('utf-8'),
2136                                 'format':       u'NA',
2137                                 'player_url':   None,
2138                         })
2139                 except UnavailableVideoError, err:
2140                         self._downloader.trouble(u'\nERROR: unable to download video')
2141
2142
2143 class YoutubeSearchIE(InfoExtractor):
2144         """Information Extractor for YouTube search queries."""
2145         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2146         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2147         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2148         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2149         _youtube_ie = None
2150         _max_youtube_results = 1000
2151         IE_NAME = u'youtube:search'
2152
2153         def __init__(self, youtube_ie, downloader=None):
2154                 InfoExtractor.__init__(self, downloader)
2155                 self._youtube_ie = youtube_ie
2156
2157         def report_download_page(self, query, pagenum):
2158                 """Report attempt to download playlist page with given number."""
2159                 query = query.decode(preferredencoding())
2160                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2161
2162         def _real_initialize(self):
2163                 self._youtube_ie.initialize()
2164
2165         def _real_extract(self, query):
2166                 mobj = re.match(self._VALID_URL, query)
2167                 if mobj is None:
2168                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2169                         return
2170
2171                 prefix, query = query.split(':')
2172                 prefix = prefix[8:]
2173                 query = query.encode('utf-8')
2174                 if prefix == '':
2175                         self._download_n_results(query, 1)
2176                         return
2177                 elif prefix == 'all':
2178                         self._download_n_results(query, self._max_youtube_results)
2179                         return
2180                 else:
2181                         try:
2182                                 n = long(prefix)
2183                                 if n <= 0:
2184                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2185                                         return
2186                                 elif n > self._max_youtube_results:
2187                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2188                                         n = self._max_youtube_results
2189                                 self._download_n_results(query, n)
2190                                 return
2191                         except ValueError: # parsing prefix as integer fails
2192                                 self._download_n_results(query, 1)
2193                                 return
2194
2195         def _download_n_results(self, query, n):
2196                 """Downloads a specified number of results for a query"""
2197
2198                 video_ids = []
2199                 already_seen = set()
2200                 pagenum = 1
2201
2202                 while True:
2203                         self.report_download_page(query, pagenum)
2204                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2205                         request = urllib2.Request(result_url)
2206                         try:
2207                                 page = urllib2.urlopen(request).read()
2208                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2209                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2210                                 return
2211
2212                         # Extract video identifiers
2213                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2214                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2215                                 if video_id not in already_seen:
2216                                         video_ids.append(video_id)
2217                                         already_seen.add(video_id)
2218                                         if len(video_ids) == n:
2219                                                 # Specified n videos reached
2220                                                 for id in video_ids:
2221                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2222                                                 return
2223
2224                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2225                                 for id in video_ids:
2226                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2227                                 return
2228
2229                         pagenum = pagenum + 1
2230
2231
2232 class GoogleSearchIE(InfoExtractor):
2233         """Information Extractor for Google Video search queries."""
2234         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2235         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2236         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2237         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2238         _google_ie = None
2239         _max_google_results = 1000
2240         IE_NAME = u'video.google:search'
2241
2242         def __init__(self, google_ie, downloader=None):
2243                 InfoExtractor.__init__(self, downloader)
2244                 self._google_ie = google_ie
2245
2246         def report_download_page(self, query, pagenum):
2247                 """Report attempt to download playlist page with given number."""
2248                 query = query.decode(preferredencoding())
2249                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2250
2251         def _real_initialize(self):
2252                 self._google_ie.initialize()
2253
2254         def _real_extract(self, query):
2255                 mobj = re.match(self._VALID_URL, query)
2256                 if mobj is None:
2257                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2258                         return
2259
2260                 prefix, query = query.split(':')
2261                 prefix = prefix[8:]
2262                 query = query.encode('utf-8')
2263                 if prefix == '':
2264                         self._download_n_results(query, 1)
2265                         return
2266                 elif prefix == 'all':
2267                         self._download_n_results(query, self._max_google_results)
2268                         return
2269                 else:
2270                         try:
2271                                 n = long(prefix)
2272                                 if n <= 0:
2273                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2274                                         return
2275                                 elif n > self._max_google_results:
2276                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2277                                         n = self._max_google_results
2278                                 self._download_n_results(query, n)
2279                                 return
2280                         except ValueError: # parsing prefix as integer fails
2281                                 self._download_n_results(query, 1)
2282                                 return
2283
2284         def _download_n_results(self, query, n):
2285                 """Downloads a specified number of results for a query"""
2286
2287                 video_ids = []
2288                 already_seen = set()
2289                 pagenum = 1
2290
2291                 while True:
2292                         self.report_download_page(query, pagenum)
2293                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2294                         request = urllib2.Request(result_url)
2295                         try:
2296                                 page = urllib2.urlopen(request).read()
2297                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2298                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2299                                 return
2300
2301                         # Extract video identifiers
2302                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2303                                 video_id = mobj.group(1)
2304                                 if video_id not in already_seen:
2305                                         video_ids.append(video_id)
2306                                         already_seen.add(video_id)
2307                                         if len(video_ids) == n:
2308                                                 # Specified n videos reached
2309                                                 for id in video_ids:
2310                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2311                                                 return
2312
2313                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2314                                 for id in video_ids:
2315                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2316                                 return
2317
2318                         pagenum = pagenum + 1
2319
2320
2321 class YahooSearchIE(InfoExtractor):
2322         """Information Extractor for Yahoo! Video search queries."""
2323         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2324         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2325         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2326         _MORE_PAGES_INDICATOR = r'\s*Next'
2327         _yahoo_ie = None
2328         _max_yahoo_results = 1000
2329         IE_NAME = u'video.yahoo:search'
2330
2331         def __init__(self, yahoo_ie, downloader=None):
2332                 InfoExtractor.__init__(self, downloader)
2333                 self._yahoo_ie = yahoo_ie
2334
2335         def report_download_page(self, query, pagenum):
2336                 """Report attempt to download playlist page with given number."""
2337                 query = query.decode(preferredencoding())
2338                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2339
2340         def _real_initialize(self):
2341                 self._yahoo_ie.initialize()
2342
2343         def _real_extract(self, query):
2344                 mobj = re.match(self._VALID_URL, query)
2345                 if mobj is None:
2346                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2347                         return
2348
2349                 prefix, query = query.split(':')
2350                 prefix = prefix[8:]
2351                 query = query.encode('utf-8')
2352                 if prefix == '':
2353                         self._download_n_results(query, 1)
2354                         return
2355                 elif prefix == 'all':
2356                         self._download_n_results(query, self._max_yahoo_results)
2357                         return
2358                 else:
2359                         try:
2360                                 n = long(prefix)
2361                                 if n <= 0:
2362                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2363                                         return
2364                                 elif n > self._max_yahoo_results:
2365                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2366                                         n = self._max_yahoo_results
2367                                 self._download_n_results(query, n)
2368                                 return
2369                         except ValueError: # parsing prefix as integer fails
2370                                 self._download_n_results(query, 1)
2371                                 return
2372
2373         def _download_n_results(self, query, n):
2374                 """Downloads a specified number of results for a query"""
2375
2376                 video_ids = []
2377                 already_seen = set()
2378                 pagenum = 1
2379
2380                 while True:
2381                         self.report_download_page(query, pagenum)
2382                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2383                         request = urllib2.Request(result_url)
2384                         try:
2385                                 page = urllib2.urlopen(request).read()
2386                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2388                                 return
2389
2390                         # Extract video identifiers
2391                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2392                                 video_id = mobj.group(1)
2393                                 if video_id not in already_seen:
2394                                         video_ids.append(video_id)
2395                                         already_seen.add(video_id)
2396                                         if len(video_ids) == n:
2397                                                 # Specified n videos reached
2398                                                 for id in video_ids:
2399                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2400                                                 return
2401
2402                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2403                                 for id in video_ids:
2404                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2405                                 return
2406
2407                         pagenum = pagenum + 1
2408
2409
2410 class YoutubePlaylistIE(InfoExtractor):
2411         """Information Extractor for YouTube playlists."""
2412
2413         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2414         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2415         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2416         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2417         _youtube_ie = None
2418         IE_NAME = u'youtube:playlist'
2419
2420         def __init__(self, youtube_ie, downloader=None):
2421                 InfoExtractor.__init__(self, downloader)
2422                 self._youtube_ie = youtube_ie
2423
2424         def report_download_page(self, playlist_id, pagenum):
2425                 """Report attempt to download playlist page with given number."""
2426                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2427
2428         def _real_initialize(self):
2429                 self._youtube_ie.initialize()
2430
2431         def _real_extract(self, url):
2432                 # Extract playlist id
2433                 mobj = re.match(self._VALID_URL, url)
2434                 if mobj is None:
2435                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2436                         return
2437
2438                 # Single video case
2439                 if mobj.group(3) is not None:
2440                         self._youtube_ie.extract(mobj.group(3))
2441                         return
2442
2443                 # Download playlist pages
2444                 # prefix is 'p' as default for playlists but there are other types that need extra care
2445                 playlist_prefix = mobj.group(1)
2446                 if playlist_prefix == 'a':
2447                         playlist_access = 'artist'
2448                 else:
2449                         playlist_prefix = 'p'
2450                         playlist_access = 'view_play_list'
2451                 playlist_id = mobj.group(2)
2452                 video_ids = []
2453                 pagenum = 1
2454
2455                 while True:
2456                         self.report_download_page(playlist_id, pagenum)
2457                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2458                         try:
2459                                 page = urllib2.urlopen(request).read()
2460                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2461                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2462                                 return
2463
2464                         # Extract video identifiers
2465                         ids_in_page = []
2466                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2467                                 if mobj.group(1) not in ids_in_page:
2468                                         ids_in_page.append(mobj.group(1))
2469                         video_ids.extend(ids_in_page)
2470
2471                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2472                                 break
2473                         pagenum = pagenum + 1
2474
2475                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2476                 playlistend = self._downloader.params.get('playlistend', -1)
2477                 video_ids = video_ids[playliststart:playlistend]
2478
2479                 for id in video_ids:
2480                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2481                 return
2482
2483
2484 class YoutubeUserIE(InfoExtractor):
2485         """Information Extractor for YouTube users."""
2486
2487         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2488         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2489         _GDATA_PAGE_SIZE = 50
2490         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2491         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2492         _youtube_ie = None
2493         IE_NAME = u'youtube:user'
2494
2495         def __init__(self, youtube_ie, downloader=None):
2496                 InfoExtractor.__init__(self, downloader)
2497                 self._youtube_ie = youtube_ie
2498
2499         def report_download_page(self, username, start_index):
2500                 """Report attempt to download user page."""
2501                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2502                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2503
2504         def _real_initialize(self):
2505                 self._youtube_ie.initialize()
2506
2507         def _real_extract(self, url):
2508                 # Extract username
2509                 mobj = re.match(self._VALID_URL, url)
2510                 if mobj is None:
2511                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2512                         return
2513
2514                 username = mobj.group(1)
2515
2516                 # Download video ids using YouTube Data API. Result size per
2517                 # query is limited (currently to 50 videos) so we need to query
2518                 # page by page until there are no video ids - it means we got
2519                 # all of them.
2520
2521                 video_ids = []
2522                 pagenum = 0
2523
2524                 while True:
2525                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2526                         self.report_download_page(username, start_index)
2527
2528                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2529
2530                         try:
2531                                 page = urllib2.urlopen(request).read()
2532                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2533                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2534                                 return
2535
2536                         # Extract video identifiers
2537                         ids_in_page = []
2538
2539                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2540                                 if mobj.group(1) not in ids_in_page:
2541                                         ids_in_page.append(mobj.group(1))
2542
2543                         video_ids.extend(ids_in_page)
2544
2545                         # A little optimization - if current page is not
2546                         # "full", ie. does not contain PAGE_SIZE video ids then
2547                         # we can assume that this page is the last one - there
2548                         # are no more ids on further pages - no need to query
2549                         # again.
2550
2551                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2552                                 break
2553
2554                         pagenum += 1
2555
2556                 all_ids_count = len(video_ids)
2557                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2558                 playlistend = self._downloader.params.get('playlistend', -1)
2559
2560                 if playlistend == -1:
2561                         video_ids = video_ids[playliststart:]
2562                 else:
2563                         video_ids = video_ids[playliststart:playlistend]
2564
2565                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2566                                 (username, all_ids_count, len(video_ids)))
2567
2568                 for video_id in video_ids:
2569                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2570
2571
2572 class DepositFilesIE(InfoExtractor):
2573         """Information extractor for depositfiles.com"""
2574
2575         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2576         IE_NAME = u'DepositFiles'
2577
2578         def __init__(self, downloader=None):
2579                 InfoExtractor.__init__(self, downloader)
2580
2581         def report_download_webpage(self, file_id):
2582                 """Report webpage download."""
2583                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2584
2585         def report_extraction(self, file_id):
2586                 """Report information extraction."""
2587                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2588
2589         def _real_initialize(self):
2590                 return
2591
2592         def _real_extract(self, url):
2593                 # At this point we have a new file
2594                 self._downloader.increment_downloads()
2595
2596                 file_id = url.split('/')[-1]
2597                 # Rebuild url in english locale
2598                 url = 'http://depositfiles.com/en/files/' + file_id
2599
2600                 # Retrieve file webpage with 'Free download' button pressed
2601                 free_download_indication = { 'gateway_result' : '1' }
2602                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2603                 try:
2604                         self.report_download_webpage(file_id)
2605                         webpage = urllib2.urlopen(request).read()
2606                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2607                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2608                         return
2609
2610                 # Search for the real file URL
2611                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2612                 if (mobj is None) or (mobj.group(1) is None):
2613                         # Try to figure out reason of the error.
2614                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2615                         if (mobj is not None) and (mobj.group(1) is not None):
2616                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2617                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2618                         else:
2619                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2620                         return
2621
2622                 file_url = mobj.group(1)
2623                 file_extension = os.path.splitext(file_url)[1][1:]
2624
2625                 # Search for file title
2626                 mobj = re.search(r'<b title="(.*?)">', webpage)
2627                 if mobj is None:
2628                         self._downloader.trouble(u'ERROR: unable to extract title')
2629                         return
2630                 file_title = mobj.group(1).decode('utf-8')
2631
2632                 try:
2633                         # Process file information
2634                         self._downloader.process_info({
2635                                 'id':           file_id.decode('utf-8'),
2636                                 'url':          file_url.decode('utf-8'),
2637                                 'uploader':     u'NA',
2638                                 'upload_date':  u'NA',
2639                                 'title':        file_title,
2640                                 'stitle':       file_title,
2641                                 'ext':          file_extension.decode('utf-8'),
2642                                 'format':       u'NA',
2643                                 'player_url':   None,
2644                         })
2645                 except UnavailableVideoError, err:
2646                         self._downloader.trouble(u'ERROR: unable to download file')
2647
2648
2649 class FacebookIE(InfoExtractor):
2650         """Information Extractor for Facebook"""
2651
2652         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2653         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2654         _NETRC_MACHINE = 'facebook'
2655         _available_formats = ['highqual', 'lowqual']
2656         _video_extensions = {
2657                 'highqual': 'mp4',
2658                 'lowqual': 'mp4',
2659         }
2660         IE_NAME = u'facebook'
2661
2662         def __init__(self, downloader=None):
2663                 InfoExtractor.__init__(self, downloader)
2664
2665         def _reporter(self, message):
2666                 """Add header and report message."""
2667                 self._downloader.to_screen(u'[facebook] %s' % message)
2668
2669         def report_login(self):
2670                 """Report attempt to log in."""
2671                 self._reporter(u'Logging in')
2672
2673         def report_video_webpage_download(self, video_id):
2674                 """Report attempt to download video webpage."""
2675                 self._reporter(u'%s: Downloading video webpage' % video_id)
2676
2677         def report_information_extraction(self, video_id):
2678                 """Report attempt to extract video information."""
2679                 self._reporter(u'%s: Extracting video information' % video_id)
2680
2681         def _parse_page(self, video_webpage):
2682                 """Extract video information from page"""
2683                 # General data
2684                 data = {'title': r'class="video_title datawrap">(.*?)</',
2685                         'description': r'<div class="datawrap">(.*?)</div>',
2686                         'owner': r'\("video_owner_name", "(.*?)"\)',
2687                         'upload_date': r'data-date="(.*?)"',
2688                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2689                         }
2690                 video_info = {}
2691                 for piece in data.keys():
2692                         mobj = re.search(data[piece], video_webpage)
2693                         if mobj is not None:
2694                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2695
2696                 # Video urls
2697                 video_urls = {}
2698                 for fmt in self._available_formats:
2699                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2700                         if mobj is not None:
2701                                 # URL is in a Javascript segment inside an escaped Unicode format within
2702                                 # the generally utf-8 page
2703                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2704                 video_info['video_urls'] = video_urls
2705
2706                 return video_info
2707
2708         def _real_initialize(self):
2709                 if self._downloader is None:
2710                         return
2711
2712                 useremail = None
2713                 password = None
2714                 downloader_params = self._downloader.params
2715
2716                 # Attempt to use provided username and password or .netrc data
2717                 if downloader_params.get('username', None) is not None:
2718                         useremail = downloader_params['username']
2719                         password = downloader_params['password']
2720                 elif downloader_params.get('usenetrc', False):
2721                         try:
2722                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2723                                 if info is not None:
2724                                         useremail = info[0]
2725                                         password = info[2]
2726                                 else:
2727                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2728                         except (IOError, netrc.NetrcParseError), err:
2729                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2730                                 return
2731
2732                 if useremail is None:
2733                         return
2734
2735                 # Log in
2736                 login_form = {
2737                         'email': useremail,
2738                         'pass': password,
2739                         'login': 'Log+In'
2740                         }
2741                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2742                 try:
2743                         self.report_login()
2744                         login_results = urllib2.urlopen(request).read()
2745                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2746                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2747                                 return
2748                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2749                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2750                         return
2751
2752         def _real_extract(self, url):
2753                 mobj = re.match(self._VALID_URL, url)
2754                 if mobj is None:
2755                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2756                         return
2757                 video_id = mobj.group('ID')
2758
2759                 # Get video webpage
2760                 self.report_video_webpage_download(video_id)
2761                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2762                 try:
2763                         page = urllib2.urlopen(request)
2764                         video_webpage = page.read()
2765                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2767                         return
2768
2769                 # Start extracting information
2770                 self.report_information_extraction(video_id)
2771
2772                 # Extract information
2773                 video_info = self._parse_page(video_webpage)
2774
2775                 # uploader
2776                 if 'owner' not in video_info:
2777                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2778                         return
2779                 video_uploader = video_info['owner']
2780
2781                 # title
2782                 if 'title' not in video_info:
2783                         self._downloader.trouble(u'ERROR: unable to extract video title')
2784                         return
2785                 video_title = video_info['title']
2786                 video_title = video_title.decode('utf-8')
2787                 video_title = sanitize_title(video_title)
2788
2789                 # simplified title
2790                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2791                 simple_title = simple_title.strip(ur'_')
2792
2793                 # thumbnail image
2794                 if 'thumbnail' not in video_info:
2795                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2796                         video_thumbnail = ''
2797                 else:
2798                         video_thumbnail = video_info['thumbnail']
2799
2800                 # upload date
2801                 upload_date = u'NA'
2802                 if 'upload_date' in video_info:
2803                         upload_time = video_info['upload_date']
2804                         timetuple = email.utils.parsedate_tz(upload_time)
2805                         if timetuple is not None:
2806                                 try:
2807                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2808                                 except:
2809                                         pass
2810
2811                 # description
2812                 video_description = video_info.get('description', 'No description available.')
2813
2814                 url_map = video_info['video_urls']
2815                 if len(url_map.keys()) > 0:
2816                         # Decide which formats to download
2817                         req_format = self._downloader.params.get('format', None)
2818                         format_limit = self._downloader.params.get('format_limit', None)
2819
2820                         if format_limit is not None and format_limit in self._available_formats:
2821                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2822                         else:
2823                                 format_list = self._available_formats
2824                         existing_formats = [x for x in format_list if x in url_map]
2825                         if len(existing_formats) == 0:
2826                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2827                                 return
2828                         if req_format is None:
2829                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2830                         elif req_format == '-1':
2831                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2832                         else:
2833                                 # Specific format
2834                                 if req_format not in url_map:
2835                                         self._downloader.trouble(u'ERROR: requested format not available')
2836                                         return
2837                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2838
2839                 for format_param, video_real_url in video_url_list:
2840
2841                         # At this point we have a new video
2842                         self._downloader.increment_downloads()
2843
2844                         # Extension
2845                         video_extension = self._video_extensions.get(format_param, 'mp4')
2846
2847                         try:
2848                                 # Process video information
2849                                 self._downloader.process_info({
2850                                         'id':           video_id.decode('utf-8'),
2851                                         'url':          video_real_url.decode('utf-8'),
2852                                         'uploader':     video_uploader.decode('utf-8'),
2853                                         'upload_date':  upload_date,
2854                                         'title':        video_title,
2855                                         'stitle':       simple_title,
2856                                         'ext':          video_extension.decode('utf-8'),
2857                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2858                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2859                                         'description':  video_description.decode('utf-8'),
2860                                         'player_url':   None,
2861                                 })
2862                         except UnavailableVideoError, err:
2863                                 self._downloader.trouble(u'\nERROR: unable to download video')
2864
2865 class BlipTVIE(InfoExtractor):
2866         """Information extractor for blip.tv"""
2867
2868         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2869         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2870         IE_NAME = u'blip.tv'
2871
2872         def report_extraction(self, file_id):
2873                 """Report information extraction."""
2874                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2875
2876         def _simplify_title(self, title):
2877                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2878                 res = res.strip(ur'_')
2879                 return res
2880
2881         def _real_extract(self, url):
2882                 mobj = re.match(self._VALID_URL, url)
2883                 if mobj is None:
2884                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2885                         return
2886
2887                 if '?' in url:
2888                         cchar = '&'
2889                 else:
2890                         cchar = '?'
2891                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2892                 request = urllib2.Request(json_url)
2893                 self.report_extraction(mobj.group(1))
2894                 try:
2895                         json_code = urllib2.urlopen(request).read()
2896                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2897                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2898                         return
2899                 try:
2900                         json_data = json.loads(json_code)
2901                         if 'Post' in json_data:
2902                                 data = json_data['Post']
2903                         else:
2904                                 data = json_data
2905
2906                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2907                         video_url = data['media']['url']
2908                         umobj = re.match(self._URL_EXT, video_url)
2909                         if umobj is None:
2910                                 raise ValueError('Can not determine filename extension')
2911                         ext = umobj.group(1)
2912
2913                         self._downloader.increment_downloads()
2914
2915                         info = {
2916                                 'id': data['item_id'],
2917                                 'url': video_url,
2918                                 'uploader': data['display_name'],
2919                                 'upload_date': upload_date,
2920                                 'title': data['title'],
2921                                 'stitle': self._simplify_title(data['title']),
2922                                 'ext': ext,
2923                                 'format': data['media']['mimeType'],
2924                                 'thumbnail': data['thumbnailUrl'],
2925                                 'description': data['description'],
2926                                 'player_url': data['embedUrl']
2927                         }
2928                 except (ValueError,KeyError), err:
2929                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2930                         return
2931
2932                 try:
2933                         self._downloader.process_info(info)
2934                 except UnavailableVideoError, err:
2935                         self._downloader.trouble(u'\nERROR: unable to download video')
2936
2937
2938 class MyVideoIE(InfoExtractor):
2939         """Information Extractor for myvideo.de."""
2940
2941         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2942         IE_NAME = u'myvideo'
2943
2944         def __init__(self, downloader=None):
2945                 InfoExtractor.__init__(self, downloader)
2946         
2947         def report_download_webpage(self, video_id):
2948                 """Report webpage download."""
2949                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2950
2951         def report_extraction(self, video_id):
2952                 """Report information extraction."""
2953                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2954
2955         def _real_initialize(self):
2956                 return
2957
2958         def _real_extract(self,url):
2959                 mobj = re.match(self._VALID_URL, url)
2960                 if mobj is None:
2961                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2962                         return
2963
2964                 video_id = mobj.group(1)
2965                 simple_title = mobj.group(2).decode('utf-8')
2966                 # should actually not be necessary
2967                 simple_title = sanitize_title(simple_title)
2968                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2969
2970                 # Get video webpage
2971                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2972                 try:
2973                         self.report_download_webpage(video_id)
2974                         webpage = urllib2.urlopen(request).read()
2975                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2976                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2977                         return
2978
2979                 self.report_extraction(video_id)
2980                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2981                                  webpage)
2982                 if mobj is None:
2983                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2984                         return
2985                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2986
2987                 mobj = re.search('<title>([^<]+)</title>', webpage)
2988                 if mobj is None:
2989                         self._downloader.trouble(u'ERROR: unable to extract title')
2990                         return
2991
2992                 video_title = mobj.group(1)
2993                 video_title = sanitize_title(video_title)
2994
2995                 try:
2996                         print(video_url)
2997                         self._downloader.process_info({
2998                                 'id':           video_id,
2999                                 'url':          video_url,
3000                                 'uploader':     u'NA',
3001                                 'upload_date':  u'NA',
3002                                 'title':        video_title,
3003                                 'stitle':       simple_title,
3004                                 'ext':          u'flv',
3005                                 'format':       u'NA',
3006                                 'player_url':   None,
3007                         })
3008                 except UnavailableVideoError:
3009                         self._downloader.trouble(u'\nERROR: Unable to download video')
3010
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report.

	Accepts either a short alias URL (":tds", ":colbertreport", ...) or a
	full-episodes page URL on thedailyshow.com / colbertnation.com, then
	resolves the episode's MRSS index and downloads every media item in it.
	"""

	# Group 'shortname' matches the alias form; 'showname'/'episode' match
	# the full-episodes URL form. An empty 'episode' means "latest episode".
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		# Status message: extraction started.
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		# Status message: downloading the per-media configuration XML.
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		# Status message: downloading the episode's MRSS index.
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		# Status message: resolving the Flash player URL via redirects.
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse every run of characters outside simple_title_chars into
		# a single underscore, then trim leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Rewrite alias URLs (":tds" etc.) to the real full-episodes page
		# and re-match so the 'showname'/'episode' groups are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means: download the newest episode,
		# discovered via the server-side redirect below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The bare full-episodes URL redirects to the latest episode;
			# geturl() yields the post-redirect URL with a concrete episode.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Each match is (player URL, media URI); the URI identifies the
		# episode in the MRSS feed service.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the raw player URL through its redirects; the final URL is
		# later passed to the downloader as 'player_url' (used for rtmpdump).
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per media segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like a colon-separated URI whose last component is
			# the short media id and second-to-last names the show site.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				# NOTE(review): this aborts the whole episode ('return')
				# while later per-item failures use 'continue' — confirm
				# whether that asymmetry is intentional.
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3150
3151
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist (escapistmagazine.com videos).

	Scrapes the video page's OpenGraph meta tags for thumbnail, description
	and player URL, then fetches the player's JSON-ish configuration to find
	the actual flv URL.
	"""

	_VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		# Status message: extraction started.
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		# Status message: downloading the player configuration.
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _simplify_title(self, title):
		# Collapse runs of characters outside simple_title_chars into a
		# single underscore and trim underscores at the ends.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		# Used only to unescape HTML entities found in the meta tags.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
		try:
			webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
			return

		# NOTE(review): none of these searches is checked for a miss; a page
		# without the expected meta tags raises AttributeError on .group(1)
		# rather than reporting a clean extraction error — confirm/harden.
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		# The config URL is passed to the player as a percent-encoded
		# "config=" query parameter.
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
		try:
			configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
			return

		# Technically, it's JavaScript, not JSON
		configJSON = configJSON.replace("'", '"')

		try:
			# NOTE(review): 'json' is not among the imports visible at the
			# top of this file — presumably imported elsewhere; verify.
			config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
			return

		# The second playlist entry holds the actual video URL here.
		playlist = config['playlist']
		videoUrl = playlist[1]['url']

		self._downloader.increment_downloads()
		info = {
			'id': videoId,
			'url': videoUrl,
			'uploader': showName,
			'upload_date': None,
			'title': showName,
			'stitle': self._simplify_title(showName),
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,
		}

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3233
3234
3235
class PostProcessor(object):
	"""Base class for postprocessing steps.

	A PostProcessor is registered with a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors: the first one receives
	the initial info argument, and each subsequent one receives whatever
	the previous run() returned. The chain stops as soon as a run()
	returns None or the last processor has run.

	Like InfoExtractor objects, PostProcessors use a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this postprocessor reports to; set via the constructor
	# or set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary extended with
		a "filepath" key naming the downloaded file. Return a (possibly
		modified) dictionary to pass along the chain, or None to stop
		it. Implementations may raise PostProcessingError to signal a
		failure to the calling downloader.
		"""
		# The base implementation is an identity transform.
		return information
3281
3282
class FFmpegExtractAudioPP(PostProcessor):
	"""Postprocessor that converts a downloaded video into an audio-only
	file using the external ffmpeg and ffprobe programs.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep aac/mp3 streams losslessly, otherwise
		# transcode to mp3 (see run() below).
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in `path` via ffprobe,
		or None if ffprobe is missing, fails, or finds no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe not installed / not executable.
			return None
		audio_codec = None
		# ffprobe prints codec_name= before codec_type= within each stream
		# section, so remember the last codec_name seen and report it once
		# an audio stream is confirmed.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract audio from `path` into `out_path` with the
		given codec and extra options; return True on success."""
		try:
			# NOTE(review): the '--' separator is passed to ffmpeg here —
			# confirm the installed ffmpeg accepts it before the output file.
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		# Path of the file the downloader just finished writing.
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		# Replace the video extension with the audio one, run the
		# conversion, and delete the source video on success.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		# Let the rest of the postprocessing chain see the audio file.
		information['filepath'] = new_path
		return information
3364
3365
3366 def updateSelf(downloader, filename):
3367         ''' Update the program file with the latest version from the repository '''
3368         # Note: downloader only used for options
3369         if not os.access(filename, os.W_OK):
3370                 sys.exit('ERROR: no write permissions on %s' % filename)
3371
3372         downloader.to_screen('Updating to latest version...')
3373
3374         try:
3375                 try:
3376                         urlh = urllib.urlopen(UPDATE_URL)
3377                         newcontent = urlh.read()
3378                 finally:
3379                         urlh.close()
3380         except (IOError, OSError), err:
3381                 sys.exit('ERROR: unable to download latest version')
3382
3383         try:
3384                 outf = open(filename, 'wb')
3385                 try:
3386                         outf.write(newcontent)
3387                 finally:
3388                         outf.close()
3389         except (IOError, OSError), err:
3390                 sys.exit('ERROR: unable to overwrite current version')
3391
3392         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3393
def parseOpts():
	"""Build the optparse command-line parser, parse sys.argv, and return
	the (parser, opts, args) triple."""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Return the terminal width, or None if it cannot be determined.
		# COLUMNS from the environment wins over asking stty.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			# Deliberately broad: any failure (no stty, unparsable
			# output, no tty) just means "width unknown".
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	# Custom formatter so options render as "-o, --option METAVAR".
	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')

	# The registration order below determines the order groups appear in
	# the --help output.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3566
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# Some extractors share state with the search/playlist variants built
	# around them, so instantiate those once and reuse them.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		# The catch-all extractor must stay last.
		GenericIE(),
	]
	return extractors
3596
3597 def main():
3598         parser, opts, args = parseOpts()
3599
3600         # Open appropriate CookieJar
3601         if opts.cookiefile is None:
3602                 jar = cookielib.CookieJar()
3603         else:
3604                 try:
3605                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3606                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3607                                 jar.load()
3608                 except (IOError, OSError), err:
3609                         sys.exit(u'ERROR: unable to open cookie file')
3610
3611         # Dump user agent
3612         if opts.dump_user_agent:
3613                 print std_headers['User-Agent']
3614                 sys.exit(0)
3615
3616         # Batch file verification
3617         batchurls = []
3618         if opts.batchfile is not None:
3619                 try:
3620                         if opts.batchfile == '-':
3621                                 batchfd = sys.stdin
3622                         else:
3623                                 batchfd = open(opts.batchfile, 'r')
3624                         batchurls = batchfd.readlines()
3625                         batchurls = [x.strip() for x in batchurls]
3626                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3627                 except IOError:
3628                         sys.exit(u'ERROR: batch file could not be read')
3629         all_urls = batchurls + args
3630
3631         # General configuration
3632         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3633         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3634         urllib2.install_opener(opener)
3635         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3636
3637         extractors = gen_extractors()
3638
3639         if opts.list_extractors:
3640                 for ie in extractors:
3641                         print(ie.IE_NAME)
3642                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3643                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3644                         for mu in matchedUrls:
3645                                 print(u'  ' + mu)
3646                 sys.exit(0)
3647
3648         # Conflicting, missing and erroneous options
3649         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3650                 parser.error(u'using .netrc conflicts with giving username/password')
3651         if opts.password is not None and opts.username is None:
3652                 parser.error(u'account username missing')
3653         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3654                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3655         if opts.usetitle and opts.useliteral:
3656                 parser.error(u'using title conflicts with using literal title')
3657         if opts.username is not None and opts.password is None:
3658                 opts.password = getpass.getpass(u'Type account password and press return:')
3659         if opts.ratelimit is not None:
3660                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3661                 if numeric_limit is None:
3662                         parser.error(u'invalid rate limit specified')
3663                 opts.ratelimit = numeric_limit
3664         if opts.retries is not None:
3665                 try:
3666                         opts.retries = long(opts.retries)
3667                 except (TypeError, ValueError), err:
3668                         parser.error(u'invalid retry count specified')
3669         try:
3670                 opts.playliststart = int(opts.playliststart)
3671                 if opts.playliststart <= 0:
3672                         raise ValueError(u'Playlist start must be positive')
3673         except (TypeError, ValueError), err:
3674                 parser.error(u'invalid playlist start number specified')
3675         try:
3676                 opts.playlistend = int(opts.playlistend)
3677                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3678                         raise ValueError(u'Playlist end must be greater than playlist start')
3679         except (TypeError, ValueError), err:
3680                 parser.error(u'invalid playlist end number specified')
3681         if opts.extractaudio:
3682                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3683                         parser.error(u'invalid audio format specified')
3684
3685         # File downloader
3686         fd = FileDownloader({
3687                 'usenetrc': opts.usenetrc,
3688                 'username': opts.username,
3689                 'password': opts.password,
3690                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3691                 'forceurl': opts.geturl,
3692                 'forcetitle': opts.gettitle,
3693                 'forcethumbnail': opts.getthumbnail,
3694                 'forcedescription': opts.getdescription,
3695                 'forcefilename': opts.getfilename,
3696                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3697                 'format': opts.format,
3698                 'format_limit': opts.format_limit,
3699                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3700                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3701                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3702                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3703                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3704                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3705                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3706                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3707                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3708                         or u'%(id)s.%(ext)s'),
3709                 'ignoreerrors': opts.ignoreerrors,
3710                 'ratelimit': opts.ratelimit,
3711                 'nooverwrites': opts.nooverwrites,
3712                 'retries': opts.retries,
3713                 'continuedl': opts.continue_dl,
3714                 'noprogress': opts.noprogress,
3715                 'playliststart': opts.playliststart,
3716                 'playlistend': opts.playlistend,
3717                 'logtostderr': opts.outtmpl == '-',
3718                 'consoletitle': opts.consoletitle,
3719                 'nopart': opts.nopart,
3720                 'updatetime': opts.updatetime,
3721                 'writedescription': opts.writedescription,
3722                 'writeinfojson': opts.writeinfojson,
3723                 'matchtitle': opts.matchtitle,
3724                 'rejecttitle': opts.rejecttitle,
3725                 })
3726         for extractor in extractors:
3727                 fd.add_info_extractor(extractor)
3728
3729         # PostProcessors
3730         if opts.extractaudio:
3731                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3732
3733         # Update version
3734         if opts.update_self:
3735                 updateSelf(fd, sys.argv[0])
3736
3737         # Maybe do nothing
3738         if len(all_urls) < 1:
3739                 if not opts.update_self:
3740                         parser.error(u'you must provide at least one URL')
3741                 else:
3742                         sys.exit()
3743         retcode = fd.download(all_urls)
3744
3745         # Dump cookie jar if requested
3746         if opts.cookiefile is not None:
3747                 try:
3748                         jar.save()
3749                 except (IOError, OSError), err:
3750                         sys.exit(u'ERROR: unable to save cookie jar')
3751
3752         sys.exit(retcode)
3753
3754
# Script entry point: run main() and translate the expected exception types
# into exit statuses / messages instead of letting tracebacks reach the user.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# The downloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3764
3765 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: