cd8e57b06634538842acc07853325729fde44ffa
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Contributors, in rough chronological order of first contribution.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        )

__license__ = 'Public Domain'
# Date-based version string: YYYY.MM.DD of the release.
__version__ = '2011.09.15'

# Canonical location of the newest version of this script, used by the
# self-update mechanism (--update).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers added to every request (see YoutubeDLHandler).
# The User-Agent imitates a desktop Firefox so sites serve their normal pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simple" titles, as a unicode string.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                # Minimal drop-in replacement for the stdlib json module;
                # only the loads() entry point is provided.
                @staticmethod
                def loads(s):
                        """Parse the JSON document in byte string s and return the value."""
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # All parse errors include position and remaining input.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance i past whitespace; optionally fail on end of input.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Turn one backslash escape (as matched in parseString)
                                # into the character it denotes.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # UTF-16 surrogate pair -> single code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                # Find the closing quote, skipping quotes preceded by an
                                # odd number of backslashes (i.e. escaped quotes).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three literal values: true / false / null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch table keyed on the first character of a value;
                        # anything not listed is assumed to be a number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original implementation built a one-shot generator and called
        # .next() on it, which computed exactly this value with extra
        # machinery; compute it directly instead.
        try:
                pref = locale.getpreferredencoding()
                # Verify Python can actually encode to the reported encoding;
                # some broken locales report values Python does not know.
                u'TEST'.encode(pref)
        except:
                pref = 'UTF-8'
        return pref
210
211
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#NNN;) or hexadecimal
        # (&#xHHH;). The previous pattern ur'(?u)#(x?\d+)' only accepted
        # decimal digits after the optional 'x', so hexadecimal references
        # containing a-f (e.g. &#x2f;) were truncated or rejected entirely.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        # int()/long() accept the '0x...' prefix form.
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # Replace HTML entities (&amp;, &#39;, ...) with the characters they
        # represent, then neutralize the platform's path separator.
        utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
        return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means standard output.
                        if sys.platform == 'win32':
                                import msvcrt
                                # Switch stdout to binary mode so video data is not
                                # mangled by CRLF translation on Windows.
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
270
271
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                # The string could not be parsed as an RFC 2822 date.
                return None
        return email.utils.mktime_tz(timetuple)
279
280
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        # No extra state: the message passed to the constructor is enough.
        pass
289
290
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        # No extra state beyond the standard Exception message.
        pass
298
299
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        # No extra state beyond the standard Exception message.
        pass
307
308
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        # No extra state beyond the standard Exception message.
        pass
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Byte counts describing the truncated transfer.
        downloaded = None # bytes actually received
        expected = None # bytes announced by the server

        def __init__(self, downloaded, expected):
                self.downloaded, self.expected = downloaded, expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Decompress raw deflate data; fall back to zlib-wrapped data
                # for servers that send a zlib header despite saying "deflate".
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Build an addinfourl that carries the HTTP status code even on
                # Python versions whose addinfourl lacks the code argument.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our standard headers, replacing any urllib2 defaults.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Honor (and strip) the internal no-compression marker header.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body, keeping the original
                # headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible of downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. As, given a video URL, the downloader doesn't know how to
400         extract all the needed information, task that InfoExtractors do, it
401         has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader handles it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
        # Per-instance state; these class-level values are only defaults and
        # are replaced in __init__().
        params = None # Options dictionary (documented in the class docstring)
        _ies = [] # Registered InfoExtractor objects
        _pps = [] # Registered PostProcessor objects
        _download_retcode = None # Return code the program should exit with
        _num_downloads = None # Ordinal number of the current download
        _screen_file = None # Stream for screen messages (stdout or stderr)
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
531         def add_info_extractor(self, ie):
532                 """Add an InfoExtractor object to the end of the list."""
533                 self._ies.append(ie)
534                 ie.set_downloader(self)
535
536         def add_post_processor(self, pp):
537                 """Add a PostProcessor object to the end of the chain."""
538                 self._pps.append(pp)
539                 pp.set_downloader(self)
540
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # skip_eol keeps the cursor on the same line (used by
                                # the progress display); the trailing comma on the
                                # print statement suppresses its own newline.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
551
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encoded with the preferred system encoding, like to_screen().
                print >>sys.stderr, message.encode(preferredencoding())
555
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-style OSC escape sequence that sets the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
        def fixed_template(self):
                """Checks if the output template is fixed.

                A template is "fixed" when it contains no %(...)s
                substitutions, i.e. every download would be written to the
                very same filename.
                """
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
        def try_rename(self, old_filename, new_filename):
                """Rename old_filename to new_filename, reporting trouble on failure."""
                try:
                        if old_filename == new_filename:
                                # Nothing to do (e.g. nopart mode, where no .part
                                # file is used).
                                return
                        os.rename(old_filename, new_filename)
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633
        def report_writedescription(self, descfn):
                """ Report that the description file is being written """
                # The filename may not be encodable; never abort over a message.
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
637
        def report_writeinfojson(self, infofn):
                """ Report that the metadata .info.json file is being written """
                # The filename may not be encodable; never abort over a message.
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
641
        def report_destination(self, filename):
                """Report destination filename."""
                # The filename may not be encodable; never abort over a message.
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
645
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # \r moves the cursor back to the start of the line and
                # skip_eol keeps it there, so each update overwrites the last.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
654
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
658
        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
662
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a message without the (unencodable) filename.
                        self.to_screen(u'[download] The file has already been downloaded')
669
        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')
673
674         def report_finish(self):
675                 """Report download finished."""
676                 if self.params.get('noprogress', False):
677                         self.to_screen(u'[download] Download completed')
678                 else:
679                         self.to_screen(u'')
680
681         def increment_downloads(self):
682                 """Increment the ordinal that assigns a number to each file."""
683                 self._num_downloads += 1
684
685         def prepare_filename(self, info_dict):
686                 """Generate the output filename."""
687                 try:
688                         template_dict = dict(info_dict)
689                         template_dict['epoch'] = unicode(long(time.time()))
690                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691                         filename = self.params['outtmpl'] % template_dict
692                         return filename
693                 except (ValueError, KeyError), err:
694                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
695                         return None
696
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Handles, in order: simulate-mode forced printing, title
		match/reject filtering, overwrite protection, creation of the
		destination directory, optional .description and .info.json
		side files, the download itself, and postprocessing.
		"""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# prepare_filename() reports trouble and returns None on a bad template
		if filename is None:
			return

		# Optional title-based filtering (--match-title / --reject-title)
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Make sure the destination directory exists before opening files in it
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe: referencing json.dump raises NameError when no json
			# module was importable, AttributeError on an incomplete shim.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					json.dump(info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
789
790         def download(self, url_list):
791                 """Download a given list of URLs."""
792                 if len(url_list) > 1 and self.fixed_template():
793                         raise SameFileError(self.params['outtmpl'])
794
795                 for url in url_list:
796                         suitable_found = False
797                         for ie in self._ies:
798                                 # Go to next InfoExtractor if not suitable
799                                 if not ie.suitable(url):
800                                         continue
801
802                                 # Suitable InfoExtractor found
803                                 suitable_found = True
804
805                                 # Extract information from URL and process it
806                                 ie.extract(url)
807
808                                 # Suitable InfoExtractor had been found; go to next URL
809                                 break
810
811                         if not suitable_found:
812                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
813
814                 return self._download_retcode
815
816         def post_process(self, filename, ie_info):
817                 """Run the postprocessing chain on the given file."""
818                 info = dict(ie_info)
819                 info['filepath'] = filename
820                 for pp in self._pps:
821                         info = pp.run(info)
822                         if info is None:
823                                 break
824
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump tool.

		Returns True on success, False on failure (after reporting
		trouble). Resumes with 'rtmpdump -e' while the tool keeps
		exiting with a resumable status.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][condition] idiom appends the extra arguments
		# only when the (boolean) condition is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No forward progress and a hard failure: give up.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
861
	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (RTMP URLs are delegated
		to _download_with_rtmpdump).

		Returns True on success, False on a reported failure. Raises
		ContentTooShortError when fewer bytes arrive than the server
		announced in Content-Length.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays without a Range header so it can be used
		# to re-open the connection after a 416 response.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range', 'bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed transfer speed.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			# NOTE(review): if the server sends no Content-Length,
			# data_len is None and 'data_len - resume_len' below would
			# raise TypeError — verify against servers omitting the header.
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
997
998
class InfoExtractor(object):
	"""Information Extractor base class.

	Given a URL, an information extractor produces one or more
	dictionaries describing the video(s) the URL refers to, which are
	handed to the FileDownloader for downloading, printing, etc.
	Each dictionary must contain the fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing helpers (e.g.
	when youtube-dl serves as a backend for a video search function
	such as the one in youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; they should usually also be added
	to the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor should report to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1067
1068
1069 class YoutubeIE(InfoExtractor):
1070         """Information extractor for youtube.com."""
1071
	# Accepts youtu.be short links, youtube.com /v|embed|e/ paths and
	# watch URLs carrying a v= parameter; group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL switches the site language to English (US).
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
	# Maps format code to the file extension used for the downloaded file.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}
1089
1090         def report_lang(self):
1091                 """Report attempt to set language."""
1092                 self._downloader.to_screen(u'[youtube] Setting language')
1093
1094         def report_login(self):
1095                 """Report attempt to log in."""
1096                 self._downloader.to_screen(u'[youtube] Logging in')
1097
1098         def report_age_confirmation(self):
1099                 """Report attempt to confirm age."""
1100                 self._downloader.to_screen(u'[youtube] Confirming age')
1101
1102         def report_video_webpage_download(self, video_id):
1103                 """Report attempt to download video webpage."""
1104                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1105
1106         def report_video_info_webpage_download(self, video_id):
1107                 """Report attempt to download video info webpage."""
1108                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1109
1110         def report_information_extraction(self, video_id):
1111                 """Report attempt to extract video information."""
1112                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1113
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1117
1118         def report_rtmp_download(self):
1119                 """Indicate the download will use the RTMP protocol."""
1120                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1121
	def _real_initialize(self):
		"""Set the interface language and, when credentials are
		available, log in and confirm age.

		Credentials come from the downloader params ('username' /
		'password') or, with 'usenetrc', from ~/.netrc. All failures
		short-circuit with a warning/error; nothing is raised.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1190
1191         def _real_extract(self, url):
1192                 # Extract video id from URL
1193                 mobj = re.match(self._VALID_URL, url)
1194                 if mobj is None:
1195                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1196                         return
1197                 video_id = mobj.group(2)
1198
1199                 # Get video webpage
1200                 self.report_video_webpage_download(video_id)
1201                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1202                 try:
1203                         video_webpage = urllib2.urlopen(request).read()
1204                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1205                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1206                         return
1207
1208                 # Attempt to extract SWF player URL
1209                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1210                 if mobj is not None:
1211                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1212                 else:
1213                         player_url = None
1214
1215                 # Get video info
1216                 self.report_video_info_webpage_download(video_id)
1217                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1218                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1219                                         % (video_id, el_type))
1220                         request = urllib2.Request(video_info_url)
1221                         try:
1222                                 video_info_webpage = urllib2.urlopen(request).read()
1223                                 video_info = parse_qs(video_info_webpage)
1224                                 if 'token' in video_info:
1225                                         break
1226                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1228                                 return
1229                 if 'token' not in video_info:
1230                         if 'reason' in video_info:
1231                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1232                         else:
1233                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1234                         return
1235
1236                 # Start extracting information
1237                 self.report_information_extraction(video_id)
1238
1239                 # uploader
1240                 if 'author' not in video_info:
1241                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1242                         return
1243                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1244
1245                 # title
1246                 if 'title' not in video_info:
1247                         self._downloader.trouble(u'ERROR: unable to extract video title')
1248                         return
1249                 video_title = urllib.unquote_plus(video_info['title'][0])
1250                 video_title = video_title.decode('utf-8')
1251                 video_title = sanitize_title(video_title)
1252
1253                 # simplified title
1254                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1255                 simple_title = simple_title.strip(ur'_')
1256
1257                 # thumbnail image
1258                 if 'thumbnail_url' not in video_info:
1259                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1260                         video_thumbnail = ''
1261                 else:   # don't panic if we can't find it
1262                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1263
1264                 # upload date
1265                 upload_date = u'NA'
1266                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1267                 if mobj is not None:
1268                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1269                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1270                         for expression in format_expressions:
1271                                 try:
1272                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1273                                 except:
1274                                         pass
1275
1276                 # description
1277                 try:
1278                         lxml.etree
1279                 except NameError:
1280                         video_description = u'No description available.'
1281                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1282                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1283                                 if mobj is not None:
1284                                         video_description = mobj.group(1).decode('utf-8')
1285                 else:
1286                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1287                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1288                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1289                         # TODO use another parser
1290
1291                 # token
1292                 video_token = urllib.unquote_plus(video_info['token'][0])
1293
1294                 # Decide which formats to download
1295                 req_format = self._downloader.params.get('format', None)
1296
1297                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1298                         self.report_rtmp_download()
1299                         video_url_list = [(None, video_info['conn'][0])]
1300                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1301                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1302                         url_data = [parse_qs(uds) for uds in url_data_strs]
1303                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1304                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1305
1306                         format_limit = self._downloader.params.get('format_limit', None)
1307                         if format_limit is not None and format_limit in self._available_formats:
1308                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1309                         else:
1310                                 format_list = self._available_formats
1311                         existing_formats = [x for x in format_list if x in url_map]
1312                         if len(existing_formats) == 0:
1313                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1314                                 return
1315                         if req_format is None:
1316                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1317                         elif req_format == '-1':
1318                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1319                         else:
1320                                 # Specific format
1321                                 if req_format not in url_map:
1322                                         self._downloader.trouble(u'ERROR: requested format not available')
1323                                         return
1324                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1325                 else:
1326                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1327                         return
1328
1329                 for format_param, video_real_url in video_url_list:
1330                         # At this point we have a new video
1331                         self._downloader.increment_downloads()
1332
1333                         # Extension
1334                         video_extension = self._video_extensions.get(format_param, 'flv')
1335
1336                         try:
1337                                 # Process video information
1338                                 self._downloader.process_info({
1339                                         'id':           video_id.decode('utf-8'),
1340                                         'url':          video_real_url.decode('utf-8'),
1341                                         'uploader':     video_uploader.decode('utf-8'),
1342                                         'upload_date':  upload_date,
1343                                         'title':        video_title,
1344                                         'stitle':       simple_title,
1345                                         'ext':          video_extension.decode('utf-8'),
1346                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1347                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1348                                         'description':  video_description,
1349                                         'player_url':   player_url,
1350                                 })
1351                         except UnavailableVideoError, err:
1352                                 self._downloader.trouble(u'\nERROR: unable to download video')
1353
1354
1355 class MetacafeIE(InfoExtractor):
1356         """Information Extractor for metacafe.com."""
1357
1358         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1359         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1360         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1361         _youtube_ie = None
1362
1363         def __init__(self, youtube_ie, downloader=None):
1364                 InfoExtractor.__init__(self, downloader)
1365                 self._youtube_ie = youtube_ie
1366
1367         def report_disclaimer(self):
1368                 """Report disclaimer retrieval."""
1369                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1370
1371         def report_age_confirmation(self):
1372                 """Report attempt to confirm age."""
1373                 self._downloader.to_screen(u'[metacafe] Confirming age')
1374
1375         def report_download_webpage(self, video_id):
1376                 """Report webpage download."""
1377                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1378
1379         def report_extraction(self, video_id):
1380                 """Report information extraction."""
1381                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1382
1383         def _real_initialize(self):
1384                 # Retrieve disclaimer
1385                 request = urllib2.Request(self._DISCLAIMER)
1386                 try:
1387                         self.report_disclaimer()
1388                         disclaimer = urllib2.urlopen(request).read()
1389                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1390                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1391                         return
1392
1393                 # Confirm age
1394                 disclaimer_form = {
1395                         'filters': '0',
1396                         'submit': "Continue - I'm over 18",
1397                         }
1398                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1399                 try:
1400                         self.report_age_confirmation()
1401                         disclaimer = urllib2.urlopen(request).read()
1402                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1404                         return
1405
1406         def _real_extract(self, url):
1407                 # Extract id and simplified title from URL
1408                 mobj = re.match(self._VALID_URL, url)
1409                 if mobj is None:
1410                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1411                         return
1412
1413                 video_id = mobj.group(1)
1414
1415                 # Check if video comes from YouTube
1416                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1417                 if mobj2 is not None:
1418                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1419                         return
1420
1421                 # At this point we have a new video
1422                 self._downloader.increment_downloads()
1423
1424                 simple_title = mobj.group(2).decode('utf-8')
1425
1426                 # Retrieve video webpage to extract further information
1427                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1428                 try:
1429                         self.report_download_webpage(video_id)
1430                         webpage = urllib2.urlopen(request).read()
1431                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1432                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1433                         return
1434
1435                 # Extract URL, uploader and title from webpage
1436                 self.report_extraction(video_id)
1437                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1438                 if mobj is not None:
1439                         mediaURL = urllib.unquote(mobj.group(1))
1440                         video_extension = mediaURL[-3:]
1441
1442                         # Extract gdaKey if available
1443                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1444                         if mobj is None:
1445                                 video_url = mediaURL
1446                         else:
1447                                 gdaKey = mobj.group(1)
1448                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1449                 else:
1450                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1451                         if mobj is None:
1452                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1453                                 return
1454                         vardict = parse_qs(mobj.group(1))
1455                         if 'mediaData' not in vardict:
1456                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1457                                 return
1458                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1459                         if mobj is None:
1460                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1461                                 return
1462                         mediaURL = mobj.group(1).replace('\\/', '/')
1463                         video_extension = mediaURL[-3:]
1464                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1465
1466                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1467                 if mobj is None:
1468                         self._downloader.trouble(u'ERROR: unable to extract title')
1469                         return
1470                 video_title = mobj.group(1).decode('utf-8')
1471                 video_title = sanitize_title(video_title)
1472
1473                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1474                 if mobj is None:
1475                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1476                         return
1477                 video_uploader = mobj.group(1)
1478
1479                 try:
1480                         # Process video information
1481                         self._downloader.process_info({
1482                                 'id':           video_id.decode('utf-8'),
1483                                 'url':          video_url.decode('utf-8'),
1484                                 'uploader':     video_uploader.decode('utf-8'),
1485                                 'upload_date':  u'NA',
1486                                 'title':        video_title,
1487                                 'stitle':       simple_title,
1488                                 'ext':          video_extension.decode('utf-8'),
1489                                 'format':       u'NA',
1490                                 'player_url':   None,
1491                         })
1492                 except UnavailableVideoError:
1493                         self._downloader.trouble(u'\nERROR: unable to download video')
1494
1495
1496 class DailymotionIE(InfoExtractor):
1497         """Information Extractor for Dailymotion"""
1498
1499         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1500
1501         def __init__(self, downloader=None):
1502                 InfoExtractor.__init__(self, downloader)
1503
1504         def report_download_webpage(self, video_id):
1505                 """Report webpage download."""
1506                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1507
1508         def report_extraction(self, video_id):
1509                 """Report information extraction."""
1510                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1511
1512         def _real_initialize(self):
1513                 return
1514
1515         def _real_extract(self, url):
1516                 # Extract id and simplified title from URL
1517                 mobj = re.match(self._VALID_URL, url)
1518                 if mobj is None:
1519                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1520                         return
1521
1522                 # At this point we have a new video
1523                 self._downloader.increment_downloads()
1524                 video_id = mobj.group(1)
1525
1526                 simple_title = mobj.group(2).decode('utf-8')
1527                 video_extension = 'flv'
1528
1529                 # Retrieve video webpage to extract further information
1530                 request = urllib2.Request(url)
1531                 request.add_header('Cookie', 'family_filter=off')
1532                 try:
1533                         self.report_download_webpage(video_id)
1534                         webpage = urllib2.urlopen(request).read()
1535                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1536                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1537                         return
1538
1539                 # Extract URL, uploader and title from webpage
1540                 self.report_extraction(video_id)
1541                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1542                 if mobj is None:
1543                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1544                         return
1545                 sequence = urllib.unquote(mobj.group(1))
1546                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1547                 if mobj is None:
1548                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1549                         return
1550                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1551
1552                 # if needed add http://www.dailymotion.com/ if relative URL
1553
1554                 video_url = mediaURL
1555
1556                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1557                 if mobj is None:
1558                         self._downloader.trouble(u'ERROR: unable to extract title')
1559                         return
1560                 video_title = mobj.group(1).decode('utf-8')
1561                 video_title = sanitize_title(video_title)
1562
1563                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1564                 if mobj is None:
1565                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1566                         return
1567                 video_uploader = mobj.group(1)
1568
1569                 try:
1570                         # Process video information
1571                         self._downloader.process_info({
1572                                 'id':           video_id.decode('utf-8'),
1573                                 'url':          video_url.decode('utf-8'),
1574                                 'uploader':     video_uploader.decode('utf-8'),
1575                                 'upload_date':  u'NA',
1576                                 'title':        video_title,
1577                                 'stitle':       simple_title,
1578                                 'ext':          video_extension.decode('utf-8'),
1579                                 'format':       u'NA',
1580                                 'player_url':   None,
1581                         })
1582                 except UnavailableVideoError:
1583                         self._downloader.trouble(u'\nERROR: unable to download video')
1584
1585
1586 class GoogleIE(InfoExtractor):
1587         """Information extractor for video.google.com."""
1588
1589         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1590
1591         def __init__(self, downloader=None):
1592                 InfoExtractor.__init__(self, downloader)
1593
1594         def report_download_webpage(self, video_id):
1595                 """Report webpage download."""
1596                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1597
1598         def report_extraction(self, video_id):
1599                 """Report information extraction."""
1600                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1601
1602         def _real_initialize(self):
1603                 return
1604
1605         def _real_extract(self, url):
1606                 # Extract id from URL
1607                 mobj = re.match(self._VALID_URL, url)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1610                         return
1611
1612                 # At this point we have a new video
1613                 self._downloader.increment_downloads()
1614                 video_id = mobj.group(1)
1615
1616                 video_extension = 'mp4'
1617
1618                 # Retrieve video webpage to extract further information
1619                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1620                 try:
1621                         self.report_download_webpage(video_id)
1622                         webpage = urllib2.urlopen(request).read()
1623                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1625                         return
1626
1627                 # Extract URL, uploader, and title from webpage
1628                 self.report_extraction(video_id)
1629                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1630                 if mobj is None:
1631                         video_extension = 'flv'
1632                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1633                 if mobj is None:
1634                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1635                         return
1636                 mediaURL = urllib.unquote(mobj.group(1))
1637                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1638                 mediaURL = mediaURL.replace('\\x26', '\x26')
1639
1640                 video_url = mediaURL
1641
1642                 mobj = re.search(r'<title>(.*)</title>', webpage)
1643                 if mobj is None:
1644                         self._downloader.trouble(u'ERROR: unable to extract title')
1645                         return
1646                 video_title = mobj.group(1).decode('utf-8')
1647                 video_title = sanitize_title(video_title)
1648                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1649
1650                 # Extract video description
1651                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1652                 if mobj is None:
1653                         self._downloader.trouble(u'ERROR: unable to extract video description')
1654                         return
1655                 video_description = mobj.group(1).decode('utf-8')
1656                 if not video_description:
1657                         video_description = 'No description available.'
1658
1659                 # Extract video thumbnail
1660                 if self._downloader.params.get('forcethumbnail', False):
1661                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1662                         try:
1663                                 webpage = urllib2.urlopen(request).read()
1664                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1665                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1666                                 return
1667                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1668                         if mobj is None:
1669                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1670                                 return
1671                         video_thumbnail = mobj.group(1)
1672                 else:   # we need something to pass to process_info
1673                         video_thumbnail = ''
1674
1675                 try:
1676                         # Process video information
1677                         self._downloader.process_info({
1678                                 'id':           video_id.decode('utf-8'),
1679                                 'url':          video_url.decode('utf-8'),
1680                                 'uploader':     u'NA',
1681                                 'upload_date':  u'NA',
1682                                 'title':        video_title,
1683                                 'stitle':       simple_title,
1684                                 'ext':          video_extension.decode('utf-8'),
1685                                 'format':       u'NA',
1686                                 'player_url':   None,
1687                         })
1688                 except UnavailableVideoError:
1689                         self._downloader.trouble(u'\nERROR: unable to download video')
1690
1691
1692 class PhotobucketIE(InfoExtractor):
1693         """Information extractor for photobucket.com."""
1694
1695         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1696
1697         def __init__(self, downloader=None):
1698                 InfoExtractor.__init__(self, downloader)
1699
1700         def report_download_webpage(self, video_id):
1701                 """Report webpage download."""
1702                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1703
1704         def report_extraction(self, video_id):
1705                 """Report information extraction."""
1706                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1707
1708         def _real_initialize(self):
1709                 return
1710
1711         def _real_extract(self, url):
1712                 # Extract id from URL
1713                 mobj = re.match(self._VALID_URL, url)
1714                 if mobj is None:
1715                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1716                         return
1717
1718                 # At this point we have a new video
1719                 self._downloader.increment_downloads()
1720                 video_id = mobj.group(1)
1721
1722                 video_extension = 'flv'
1723
1724                 # Retrieve video webpage to extract further information
1725                 request = urllib2.Request(url)
1726                 try:
1727                         self.report_download_webpage(video_id)
1728                         webpage = urllib2.urlopen(request).read()
1729                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1730                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1731                         return
1732
1733                 # Extract URL, uploader, and title from webpage
1734                 self.report_extraction(video_id)
1735                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1736                 if mobj is None:
1737                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1738                         return
1739                 mediaURL = urllib.unquote(mobj.group(1))
1740
1741                 video_url = mediaURL
1742
1743                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1744                 if mobj is None:
1745                         self._downloader.trouble(u'ERROR: unable to extract title')
1746                         return
1747                 video_title = mobj.group(1).decode('utf-8')
1748                 video_title = sanitize_title(video_title)
1749                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1750
1751                 video_uploader = mobj.group(2).decode('utf-8')
1752
1753                 try:
1754                         # Process video information
1755                         self._downloader.process_info({
1756                                 'id':           video_id.decode('utf-8'),
1757                                 'url':          video_url.decode('utf-8'),
1758                                 'uploader':     video_uploader,
1759                                 'upload_date':  u'NA',
1760                                 'title':        video_title,
1761                                 'stitle':       simple_title,
1762                                 'ext':          video_extension.decode('utf-8'),
1763                                 'format':       u'NA',
1764                                 'player_url':   None,
1765                         })
1766                 except UnavailableVideoError:
1767                         self._downloader.trouble(u'\nERROR: unable to download video')
1768
1769
1770 class YahooIE(InfoExtractor):
1771         """Information extractor for video.yahoo.com."""
1772
1773         # _VALID_URL matches all Yahoo! Video URLs
1774         # _VPAGE_URL matches only the extractable '/watch/' URLs
1775         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1776         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1777
1778         def __init__(self, downloader=None):
1779                 InfoExtractor.__init__(self, downloader)
1780
1781         def report_download_webpage(self, video_id):
1782                 """Report webpage download."""
1783                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1784
1785         def report_extraction(self, video_id):
1786                 """Report information extraction."""
1787                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1788
1789         def _real_initialize(self):
1790                 return
1791
1792         def _real_extract(self, url, new_video=True):
1793                 # Extract ID from URL
1794                 mobj = re.match(self._VALID_URL, url)
1795                 if mobj is None:
1796                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1797                         return
1798
1799                 # At this point we have a new video
1800                 self._downloader.increment_downloads()
1801                 video_id = mobj.group(2)
1802                 video_extension = 'flv'
1803
1804                 # Rewrite valid but non-extractable URLs as
1805                 # extractable English language /watch/ URLs
1806                 if re.match(self._VPAGE_URL, url) is None:
1807                         request = urllib2.Request(url)
1808                         try:
1809                                 webpage = urllib2.urlopen(request).read()
1810                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1811                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1812                                 return
1813
1814                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1815                         if mobj is None:
1816                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1817                                 return
1818                         yahoo_id = mobj.group(1)
1819
1820                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1821                         if mobj is None:
1822                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1823                                 return
1824                         yahoo_vid = mobj.group(1)
1825
1826                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1827                         return self._real_extract(url, new_video=False)
1828
1829                 # Retrieve video webpage to extract further information
1830                 request = urllib2.Request(url)
1831                 try:
1832                         self.report_download_webpage(video_id)
1833                         webpage = urllib2.urlopen(request).read()
1834                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1836                         return
1837
1838                 # Extract uploader and title from webpage
1839                 self.report_extraction(video_id)
1840                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: unable to extract video title')
1843                         return
1844                 video_title = mobj.group(1).decode('utf-8')
1845                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1846
1847                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1848                 if mobj is None:
1849                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1850                         return
1851                 video_uploader = mobj.group(1).decode('utf-8')
1852
1853                 # Extract video thumbnail
1854                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1855                 if mobj is None:
1856                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1857                         return
1858                 video_thumbnail = mobj.group(1).decode('utf-8')
1859
1860                 # Extract video description
1861                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1862                 if mobj is None:
1863                         self._downloader.trouble(u'ERROR: unable to extract video description')
1864                         return
1865                 video_description = mobj.group(1).decode('utf-8')
1866                 if not video_description:
1867                         video_description = 'No description available.'
1868
1869                 # Extract video height and width
1870                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1871                 if mobj is None:
1872                         self._downloader.trouble(u'ERROR: unable to extract video height')
1873                         return
1874                 yv_video_height = mobj.group(1)
1875
1876                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1877                 if mobj is None:
1878                         self._downloader.trouble(u'ERROR: unable to extract video width')
1879                         return
1880                 yv_video_width = mobj.group(1)
1881
1882                 # Retrieve video playlist to extract media URL
1883                 # I'm not completely sure what all these options are, but we
1884                 # seem to need most of them, otherwise the server sends a 401.
1885                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1886                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1887                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1888                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1889                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1890                 try:
1891                         self.report_download_webpage(video_id)
1892                         webpage = urllib2.urlopen(request).read()
1893                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1894                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1895                         return
1896
1897                 # Extract media URL from playlist XML
1898                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1899                 if mobj is None:
1900                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1901                         return
1902                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1903                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1904
1905                 try:
1906                         # Process video information
1907                         self._downloader.process_info({
1908                                 'id':           video_id.decode('utf-8'),
1909                                 'url':          video_url,
1910                                 'uploader':     video_uploader,
1911                                 'upload_date':  u'NA',
1912                                 'title':        video_title,
1913                                 'stitle':       simple_title,
1914                                 'ext':          video_extension.decode('utf-8'),
1915                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1916                                 'description':  video_description,
1917                                 'thumbnail':    video_thumbnail,
1918                                 'player_url':   None,
1919                         })
1920                 except UnavailableVideoError:
1921                         self._downloader.trouble(u'\nERROR: unable to download video')
1922
1923
1924 class VimeoIE(InfoExtractor):
1925         """Information extractor for vimeo.com."""
1926
1927         # _VALID_URL matches Vimeo URLs
1928         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1929
1930         def __init__(self, downloader=None):
1931                 InfoExtractor.__init__(self, downloader)
1932
1933         def report_download_webpage(self, video_id):
1934                 """Report webpage download."""
1935                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1936
1937         def report_extraction(self, video_id):
1938                 """Report information extraction."""
1939                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1940
1941         def _real_initialize(self):
1942                 return
1943
1944         def _real_extract(self, url, new_video=True):
1945                 # Extract ID from URL
1946                 mobj = re.match(self._VALID_URL, url)
1947                 if mobj is None:
1948                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1949                         return
1950
1951                 # At this point we have a new video
1952                 self._downloader.increment_downloads()
1953                 video_id = mobj.group(1)
1954
1955                 # Retrieve video webpage to extract further information
1956                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1957                 try:
1958                         self.report_download_webpage(video_id)
1959                         webpage = urllib2.urlopen(request).read()
1960                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1961                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1962                         return
1963
1964                 # Now we begin extracting as much information as we can from what we
1965                 # retrieved. First we extract the information common to all extractors,
1966                 # and latter we extract those that are Vimeo specific.
1967                 self.report_extraction(video_id)
1968
1969                 # Extract title
1970                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1971                 if mobj is None:
1972                         self._downloader.trouble(u'ERROR: unable to extract video title')
1973                         return
1974                 video_title = mobj.group(1).decode('utf-8')
1975                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1976
1977                 # Extract uploader
1978                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1981                         return
1982                 video_uploader = mobj.group(1).decode('utf-8')
1983
1984                 # Extract video thumbnail
1985                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1986                 if mobj is None:
1987                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1988                         return
1989                 video_thumbnail = mobj.group(1).decode('utf-8')
1990
1991                 # # Extract video description
1992                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1993                 # if mobj is None:
1994                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
1995                 #       return
1996                 # video_description = mobj.group(1).decode('utf-8')
1997                 # if not video_description: video_description = 'No description available.'
1998                 video_description = 'Foo.'
1999
2000                 # Vimeo specific: extract request signature
2001                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2002                 if mobj is None:
2003                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2004                         return
2005                 sig = mobj.group(1).decode('utf-8')
2006
2007                 # Vimeo specific: Extract request signature expiration
2008                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2009                 if mobj is None:
2010                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2011                         return
2012                 sig_exp = mobj.group(1).decode('utf-8')
2013
2014                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2015
2016                 try:
2017                         # Process video information
2018                         self._downloader.process_info({
2019                                 'id':           video_id.decode('utf-8'),
2020                                 'url':          video_url,
2021                                 'uploader':     video_uploader,
2022                                 'upload_date':  u'NA',
2023                                 'title':        video_title,
2024                                 'stitle':       simple_title,
2025                                 'ext':          u'mp4',
2026                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2027                                 'description':  video_description,
2028                                 'thumbnail':    video_thumbnail,
2029                                 'description':  video_description,
2030                                 'player_url':   None,
2031                         })
2032                 except UnavailableVideoError:
2033                         self._downloader.trouble(u'ERROR: unable to download video')
2034
2035
2036 class GenericIE(InfoExtractor):
2037         """Generic last-resort information extractor."""
2038
2039         _VALID_URL = '.*'
2040
2041         def __init__(self, downloader=None):
2042                 InfoExtractor.__init__(self, downloader)
2043
2044         def report_download_webpage(self, video_id):
2045                 """Report webpage download."""
2046                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2047                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2048
2049         def report_extraction(self, video_id):
2050                 """Report information extraction."""
2051                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2052
2053         def _real_initialize(self):
2054                 return
2055
2056         def _real_extract(self, url):
2057                 # At this point we have a new video
2058                 self._downloader.increment_downloads()
2059
2060                 video_id = url.split('/')[-1]
2061                 request = urllib2.Request(url)
2062                 try:
2063                         self.report_download_webpage(video_id)
2064                         webpage = urllib2.urlopen(request).read()
2065                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2067                         return
2068                 except ValueError, err:
2069                         # since this is the last-resort InfoExtractor, if
2070                         # this error is thrown, it'll be thrown here
2071                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2072                         return
2073
2074                 self.report_extraction(video_id)
2075                 # Start with something easy: JW Player in SWFObject
2076                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2077                 if mobj is None:
2078                         # Broaden the search a little bit
2079                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2080                 if mobj is None:
2081                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2082                         return
2083
2084                 # It's possible that one of the regexes
2085                 # matched, but returned an empty group:
2086                 if mobj.group(1) is None:
2087                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2088                         return
2089
2090                 video_url = urllib.unquote(mobj.group(1))
2091                 video_id = os.path.basename(video_url)
2092
2093                 # here's a fun little line of code for you:
2094                 video_extension = os.path.splitext(video_id)[1][1:]
2095                 video_id = os.path.splitext(video_id)[0]
2096
2097                 # it's tempting to parse this further, but you would
2098                 # have to take into account all the variations like
2099                 #   Video Title - Site Name
2100                 #   Site Name | Video Title
2101                 #   Video Title - Tagline | Site Name
2102                 # and so on and so forth; it's just not practical
2103                 mobj = re.search(r'<title>(.*)</title>', webpage)
2104                 if mobj is None:
2105                         self._downloader.trouble(u'ERROR: unable to extract title')
2106                         return
2107                 video_title = mobj.group(1).decode('utf-8')
2108                 video_title = sanitize_title(video_title)
2109                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2110
2111                 # video uploader is domain name
2112                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2113                 if mobj is None:
2114                         self._downloader.trouble(u'ERROR: unable to extract title')
2115                         return
2116                 video_uploader = mobj.group(1).decode('utf-8')
2117
2118                 try:
2119                         # Process video information
2120                         self._downloader.process_info({
2121                                 'id':           video_id.decode('utf-8'),
2122                                 'url':          video_url.decode('utf-8'),
2123                                 'uploader':     video_uploader,
2124                                 'upload_date':  u'NA',
2125                                 'title':        video_title,
2126                                 'stitle':       simple_title,
2127                                 'ext':          video_extension.decode('utf-8'),
2128                                 'format':       u'NA',
2129                                 'player_url':   None,
2130                         })
2131                 except UnavailableVideoError, err:
2132                         self._downloader.trouble(u'\nERROR: unable to download video')
2133
2134
2135 class YoutubeSearchIE(InfoExtractor):
2136         """Information Extractor for YouTube search queries."""
2137         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2138         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2139         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2140         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2141         _youtube_ie = None
2142         _max_youtube_results = 1000
2143
2144         def __init__(self, youtube_ie, downloader=None):
2145                 InfoExtractor.__init__(self, downloader)
2146                 self._youtube_ie = youtube_ie
2147
2148         def report_download_page(self, query, pagenum):
2149                 """Report attempt to download playlist page with given number."""
2150                 query = query.decode(preferredencoding())
2151                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2152
2153         def _real_initialize(self):
2154                 self._youtube_ie.initialize()
2155
2156         def _real_extract(self, query):
2157                 mobj = re.match(self._VALID_URL, query)
2158                 if mobj is None:
2159                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2160                         return
2161
2162                 prefix, query = query.split(':')
2163                 prefix = prefix[8:]
2164                 query = query.encode('utf-8')
2165                 if prefix == '':
2166                         self._download_n_results(query, 1)
2167                         return
2168                 elif prefix == 'all':
2169                         self._download_n_results(query, self._max_youtube_results)
2170                         return
2171                 else:
2172                         try:
2173                                 n = long(prefix)
2174                                 if n <= 0:
2175                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2176                                         return
2177                                 elif n > self._max_youtube_results:
2178                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2179                                         n = self._max_youtube_results
2180                                 self._download_n_results(query, n)
2181                                 return
2182                         except ValueError: # parsing prefix as integer fails
2183                                 self._download_n_results(query, 1)
2184                                 return
2185
2186         def _download_n_results(self, query, n):
2187                 """Downloads a specified number of results for a query"""
2188
2189                 video_ids = []
2190                 already_seen = set()
2191                 pagenum = 1
2192
2193                 while True:
2194                         self.report_download_page(query, pagenum)
2195                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2196                         request = urllib2.Request(result_url)
2197                         try:
2198                                 page = urllib2.urlopen(request).read()
2199                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2200                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2201                                 return
2202
2203                         # Extract video identifiers
2204                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2205                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2206                                 if video_id not in already_seen:
2207                                         video_ids.append(video_id)
2208                                         already_seen.add(video_id)
2209                                         if len(video_ids) == n:
2210                                                 # Specified n videos reached
2211                                                 for id in video_ids:
2212                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2213                                                 return
2214
2215                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2216                                 for id in video_ids:
2217                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2218                                 return
2219
2220                         pagenum = pagenum + 1
2221
2222
2223 class GoogleSearchIE(InfoExtractor):
2224         """Information Extractor for Google Video search queries."""
2225         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2226         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2227         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2228         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2229         _google_ie = None
2230         _max_google_results = 1000
2231
2232         def __init__(self, google_ie, downloader=None):
2233                 InfoExtractor.__init__(self, downloader)
2234                 self._google_ie = google_ie
2235
2236         def report_download_page(self, query, pagenum):
2237                 """Report attempt to download playlist page with given number."""
2238                 query = query.decode(preferredencoding())
2239                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2240
2241         def _real_initialize(self):
2242                 self._google_ie.initialize()
2243
2244         def _real_extract(self, query):
2245                 mobj = re.match(self._VALID_URL, query)
2246                 if mobj is None:
2247                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2248                         return
2249
2250                 prefix, query = query.split(':')
2251                 prefix = prefix[8:]
2252                 query = query.encode('utf-8')
2253                 if prefix == '':
2254                         self._download_n_results(query, 1)
2255                         return
2256                 elif prefix == 'all':
2257                         self._download_n_results(query, self._max_google_results)
2258                         return
2259                 else:
2260                         try:
2261                                 n = long(prefix)
2262                                 if n <= 0:
2263                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2264                                         return
2265                                 elif n > self._max_google_results:
2266                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2267                                         n = self._max_google_results
2268                                 self._download_n_results(query, n)
2269                                 return
2270                         except ValueError: # parsing prefix as integer fails
2271                                 self._download_n_results(query, 1)
2272                                 return
2273
2274         def _download_n_results(self, query, n):
2275                 """Downloads a specified number of results for a query"""
2276
2277                 video_ids = []
2278                 already_seen = set()
2279                 pagenum = 1
2280
2281                 while True:
2282                         self.report_download_page(query, pagenum)
2283                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2284                         request = urllib2.Request(result_url)
2285                         try:
2286                                 page = urllib2.urlopen(request).read()
2287                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2288                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2289                                 return
2290
2291                         # Extract video identifiers
2292                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2293                                 video_id = mobj.group(1)
2294                                 if video_id not in already_seen:
2295                                         video_ids.append(video_id)
2296                                         already_seen.add(video_id)
2297                                         if len(video_ids) == n:
2298                                                 # Specified n videos reached
2299                                                 for id in video_ids:
2300                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2301                                                 return
2302
2303                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2304                                 for id in video_ids:
2305                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2306                                 return
2307
2308                         pagenum = pagenum + 1
2309
2310
2311 class YahooSearchIE(InfoExtractor):
2312         """Information Extractor for Yahoo! Video search queries."""
2313         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2314         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2315         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2316         _MORE_PAGES_INDICATOR = r'\s*Next'
2317         _yahoo_ie = None
2318         _max_yahoo_results = 1000
2319
2320         def __init__(self, yahoo_ie, downloader=None):
2321                 InfoExtractor.__init__(self, downloader)
2322                 self._yahoo_ie = yahoo_ie
2323
2324         def report_download_page(self, query, pagenum):
2325                 """Report attempt to download playlist page with given number."""
2326                 query = query.decode(preferredencoding())
2327                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2328
2329         def _real_initialize(self):
2330                 self._yahoo_ie.initialize()
2331
2332         def _real_extract(self, query):
2333                 mobj = re.match(self._VALID_URL, query)
2334                 if mobj is None:
2335                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2336                         return
2337
2338                 prefix, query = query.split(':')
2339                 prefix = prefix[8:]
2340                 query = query.encode('utf-8')
2341                 if prefix == '':
2342                         self._download_n_results(query, 1)
2343                         return
2344                 elif prefix == 'all':
2345                         self._download_n_results(query, self._max_yahoo_results)
2346                         return
2347                 else:
2348                         try:
2349                                 n = long(prefix)
2350                                 if n <= 0:
2351                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2352                                         return
2353                                 elif n > self._max_yahoo_results:
2354                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2355                                         n = self._max_yahoo_results
2356                                 self._download_n_results(query, n)
2357                                 return
2358                         except ValueError: # parsing prefix as integer fails
2359                                 self._download_n_results(query, 1)
2360                                 return
2361
2362         def _download_n_results(self, query, n):
2363                 """Downloads a specified number of results for a query"""
2364
2365                 video_ids = []
2366                 already_seen = set()
2367                 pagenum = 1
2368
2369                 while True:
2370                         self.report_download_page(query, pagenum)
2371                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2372                         request = urllib2.Request(result_url)
2373                         try:
2374                                 page = urllib2.urlopen(request).read()
2375                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2376                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2377                                 return
2378
2379                         # Extract video identifiers
2380                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2381                                 video_id = mobj.group(1)
2382                                 if video_id not in already_seen:
2383                                         video_ids.append(video_id)
2384                                         already_seen.add(video_id)
2385                                         if len(video_ids) == n:
2386                                                 # Specified n videos reached
2387                                                 for id in video_ids:
2388                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2389                                                 return
2390
2391                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2392                                 for id in video_ids:
2393                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2394                                 return
2395
2396                         pagenum = pagenum + 1
2397
2398
2399 class YoutubePlaylistIE(InfoExtractor):
2400         """Information Extractor for YouTube playlists."""
2401
2402         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2403         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2404         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2405         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2406         _youtube_ie = None
2407
2408         def __init__(self, youtube_ie, downloader=None):
2409                 InfoExtractor.__init__(self, downloader)
2410                 self._youtube_ie = youtube_ie
2411
2412         def report_download_page(self, playlist_id, pagenum):
2413                 """Report attempt to download playlist page with given number."""
2414                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2415
2416         def _real_initialize(self):
2417                 self._youtube_ie.initialize()
2418
2419         def _real_extract(self, url):
2420                 # Extract playlist id
2421                 mobj = re.match(self._VALID_URL, url)
2422                 if mobj is None:
2423                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2424                         return
2425
2426                 # Single video case
2427                 if mobj.group(3) is not None:
2428                         self._youtube_ie.extract(mobj.group(3))
2429                         return
2430
2431                 # Download playlist pages
2432                 # prefix is 'p' as default for playlists but there are other types that need extra care
2433                 playlist_prefix = mobj.group(1)
2434                 if playlist_prefix == 'a':
2435                         playlist_access = 'artist'
2436                 else:
2437                         playlist_prefix = 'p'
2438                         playlist_access = 'view_play_list'
2439                 playlist_id = mobj.group(2)
2440                 video_ids = []
2441                 pagenum = 1
2442
2443                 while True:
2444                         self.report_download_page(playlist_id, pagenum)
2445                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2446                         try:
2447                                 page = urllib2.urlopen(request).read()
2448                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2449                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2450                                 return
2451
2452                         # Extract video identifiers
2453                         ids_in_page = []
2454                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2455                                 if mobj.group(1) not in ids_in_page:
2456                                         ids_in_page.append(mobj.group(1))
2457                         video_ids.extend(ids_in_page)
2458
2459                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2460                                 break
2461                         pagenum = pagenum + 1
2462
2463                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2464                 playlistend = self._downloader.params.get('playlistend', -1)
2465                 video_ids = video_ids[playliststart:playlistend]
2466
2467                 for id in video_ids:
2468                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2469                 return
2470
2471
2472 class YoutubeUserIE(InfoExtractor):
2473         """Information Extractor for YouTube users."""
2474
2475         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2476         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2477         _GDATA_PAGE_SIZE = 50
2478         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2479         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2480         _youtube_ie = None
2481
2482         def __init__(self, youtube_ie, downloader=None):
2483                 InfoExtractor.__init__(self, downloader)
2484                 self._youtube_ie = youtube_ie
2485
2486         def report_download_page(self, username, start_index):
2487                 """Report attempt to download user page."""
2488                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2489                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2490
2491         def _real_initialize(self):
2492                 self._youtube_ie.initialize()
2493
2494         def _real_extract(self, url):
2495                 # Extract username
2496                 mobj = re.match(self._VALID_URL, url)
2497                 if mobj is None:
2498                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2499                         return
2500
2501                 username = mobj.group(1)
2502
2503                 # Download video ids using YouTube Data API. Result size per
2504                 # query is limited (currently to 50 videos) so we need to query
2505                 # page by page until there are no video ids - it means we got
2506                 # all of them.
2507
2508                 video_ids = []
2509                 pagenum = 0
2510
2511                 while True:
2512                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2513                         self.report_download_page(username, start_index)
2514
2515                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2516
2517                         try:
2518                                 page = urllib2.urlopen(request).read()
2519                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2520                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2521                                 return
2522
2523                         # Extract video identifiers
2524                         ids_in_page = []
2525
2526                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2527                                 if mobj.group(1) not in ids_in_page:
2528                                         ids_in_page.append(mobj.group(1))
2529
2530                         video_ids.extend(ids_in_page)
2531
2532                         # A little optimization - if current page is not
2533                         # "full", ie. does not contain PAGE_SIZE video ids then
2534                         # we can assume that this page is the last one - there
2535                         # are no more ids on further pages - no need to query
2536                         # again.
2537
2538                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2539                                 break
2540
2541                         pagenum += 1
2542
2543                 all_ids_count = len(video_ids)
2544                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2545                 playlistend = self._downloader.params.get('playlistend', -1)
2546
2547                 if playlistend == -1:
2548                         video_ids = video_ids[playliststart:]
2549                 else:
2550                         video_ids = video_ids[playliststart:playlistend]
2551
2552                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2553                                 (username, all_ids_count, len(video_ids)))
2554
2555                 for video_id in video_ids:
2556                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2557
2558
2559 class DepositFilesIE(InfoExtractor):
2560         """Information extractor for depositfiles.com"""
2561
2562         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2563
2564         def __init__(self, downloader=None):
2565                 InfoExtractor.__init__(self, downloader)
2566
2567         def report_download_webpage(self, file_id):
2568                 """Report webpage download."""
2569                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2570
2571         def report_extraction(self, file_id):
2572                 """Report information extraction."""
2573                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2574
2575         def _real_initialize(self):
2576                 return
2577
2578         def _real_extract(self, url):
2579                 # At this point we have a new file
2580                 self._downloader.increment_downloads()
2581
2582                 file_id = url.split('/')[-1]
2583                 # Rebuild url in english locale
2584                 url = 'http://depositfiles.com/en/files/' + file_id
2585
2586                 # Retrieve file webpage with 'Free download' button pressed
2587                 free_download_indication = { 'gateway_result' : '1' }
2588                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2589                 try:
2590                         self.report_download_webpage(file_id)
2591                         webpage = urllib2.urlopen(request).read()
2592                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2593                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2594                         return
2595
2596                 # Search for the real file URL
2597                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2598                 if (mobj is None) or (mobj.group(1) is None):
2599                         # Try to figure out reason of the error.
2600                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2601                         if (mobj is not None) and (mobj.group(1) is not None):
2602                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2603                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2604                         else:
2605                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2606                         return
2607
2608                 file_url = mobj.group(1)
2609                 file_extension = os.path.splitext(file_url)[1][1:]
2610
2611                 # Search for file title
2612                 mobj = re.search(r'<b title="(.*?)">', webpage)
2613                 if mobj is None:
2614                         self._downloader.trouble(u'ERROR: unable to extract title')
2615                         return
2616                 file_title = mobj.group(1).decode('utf-8')
2617
2618                 try:
2619                         # Process file information
2620                         self._downloader.process_info({
2621                                 'id':           file_id.decode('utf-8'),
2622                                 'url':          file_url.decode('utf-8'),
2623                                 'uploader':     u'NA',
2624                                 'upload_date':  u'NA',
2625                                 'title':        file_title,
2626                                 'stitle':       file_title,
2627                                 'ext':          file_extension.decode('utf-8'),
2628                                 'format':       u'NA',
2629                                 'player_url':   None,
2630                         })
2631                 except UnavailableVideoError, err:
2632                         self._downloader.trouble(u'ERROR: unable to download file')
2633
2634
2635 class FacebookIE(InfoExtractor):
2636         """Information Extractor for Facebook"""
2637
2638         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2639         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2640         _NETRC_MACHINE = 'facebook'
2641         _available_formats = ['highqual', 'lowqual']
2642         _video_extensions = {
2643                 'highqual': 'mp4',
2644                 'lowqual': 'mp4',
2645         }
2646
2647         def __init__(self, downloader=None):
2648                 InfoExtractor.__init__(self, downloader)
2649
2650         def _reporter(self, message):
2651                 """Add header and report message."""
2652                 self._downloader.to_screen(u'[facebook] %s' % message)
2653
2654         def report_login(self):
2655                 """Report attempt to log in."""
2656                 self._reporter(u'Logging in')
2657
2658         def report_video_webpage_download(self, video_id):
2659                 """Report attempt to download video webpage."""
2660                 self._reporter(u'%s: Downloading video webpage' % video_id)
2661
2662         def report_information_extraction(self, video_id):
2663                 """Report attempt to extract video information."""
2664                 self._reporter(u'%s: Extracting video information' % video_id)
2665
2666         def _parse_page(self, video_webpage):
2667                 """Extract video information from page"""
2668                 # General data
2669                 data = {'title': r'class="video_title datawrap">(.*?)</',
2670                         'description': r'<div class="datawrap">(.*?)</div>',
2671                         'owner': r'\("video_owner_name", "(.*?)"\)',
2672                         'upload_date': r'data-date="(.*?)"',
2673                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2674                         }
2675                 video_info = {}
2676                 for piece in data.keys():
2677                         mobj = re.search(data[piece], video_webpage)
2678                         if mobj is not None:
2679                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2680
2681                 # Video urls
2682                 video_urls = {}
2683                 for fmt in self._available_formats:
2684                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2685                         if mobj is not None:
2686                                 # URL is in a Javascript segment inside an escaped Unicode format within
2687                                 # the generally utf-8 page
2688                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2689                 video_info['video_urls'] = video_urls
2690
2691                 return video_info
2692
2693         def _real_initialize(self):
2694                 if self._downloader is None:
2695                         return
2696
2697                 useremail = None
2698                 password = None
2699                 downloader_params = self._downloader.params
2700
2701                 # Attempt to use provided username and password or .netrc data
2702                 if downloader_params.get('username', None) is not None:
2703                         useremail = downloader_params['username']
2704                         password = downloader_params['password']
2705                 elif downloader_params.get('usenetrc', False):
2706                         try:
2707                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2708                                 if info is not None:
2709                                         useremail = info[0]
2710                                         password = info[2]
2711                                 else:
2712                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2713                         except (IOError, netrc.NetrcParseError), err:
2714                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2715                                 return
2716
2717                 if useremail is None:
2718                         return
2719
2720                 # Log in
2721                 login_form = {
2722                         'email': useremail,
2723                         'pass': password,
2724                         'login': 'Log+In'
2725                         }
2726                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2727                 try:
2728                         self.report_login()
2729                         login_results = urllib2.urlopen(request).read()
2730                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2731                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2732                                 return
2733                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2734                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2735                         return
2736
2737         def _real_extract(self, url):
2738                 mobj = re.match(self._VALID_URL, url)
2739                 if mobj is None:
2740                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2741                         return
2742                 video_id = mobj.group('ID')
2743
2744                 # Get video webpage
2745                 self.report_video_webpage_download(video_id)
2746                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2747                 try:
2748                         page = urllib2.urlopen(request)
2749                         video_webpage = page.read()
2750                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2751                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2752                         return
2753
2754                 # Start extracting information
2755                 self.report_information_extraction(video_id)
2756
2757                 # Extract information
2758                 video_info = self._parse_page(video_webpage)
2759
2760                 # uploader
2761                 if 'owner' not in video_info:
2762                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2763                         return
2764                 video_uploader = video_info['owner']
2765
2766                 # title
2767                 if 'title' not in video_info:
2768                         self._downloader.trouble(u'ERROR: unable to extract video title')
2769                         return
2770                 video_title = video_info['title']
2771                 video_title = video_title.decode('utf-8')
2772                 video_title = sanitize_title(video_title)
2773
2774                 # simplified title
2775                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2776                 simple_title = simple_title.strip(ur'_')
2777
2778                 # thumbnail image
2779                 if 'thumbnail' not in video_info:
2780                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2781                         video_thumbnail = ''
2782                 else:
2783                         video_thumbnail = video_info['thumbnail']
2784
2785                 # upload date
2786                 upload_date = u'NA'
2787                 if 'upload_date' in video_info:
2788                         upload_time = video_info['upload_date']
2789                         timetuple = email.utils.parsedate_tz(upload_time)
2790                         if timetuple is not None:
2791                                 try:
2792                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2793                                 except:
2794                                         pass
2795
2796                 # description
2797                 video_description = video_info.get('description', 'No description available.')
2798
2799                 url_map = video_info['video_urls']
2800                 if len(url_map.keys()) > 0:
2801                         # Decide which formats to download
2802                         req_format = self._downloader.params.get('format', None)
2803                         format_limit = self._downloader.params.get('format_limit', None)
2804
2805                         if format_limit is not None and format_limit in self._available_formats:
2806                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2807                         else:
2808                                 format_list = self._available_formats
2809                         existing_formats = [x for x in format_list if x in url_map]
2810                         if len(existing_formats) == 0:
2811                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2812                                 return
2813                         if req_format is None:
2814                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2815                         elif req_format == '-1':
2816                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2817                         else:
2818                                 # Specific format
2819                                 if req_format not in url_map:
2820                                         self._downloader.trouble(u'ERROR: requested format not available')
2821                                         return
2822                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2823
2824                 for format_param, video_real_url in video_url_list:
2825
2826                         # At this point we have a new video
2827                         self._downloader.increment_downloads()
2828
2829                         # Extension
2830                         video_extension = self._video_extensions.get(format_param, 'mp4')
2831
2832                         try:
2833                                 # Process video information
2834                                 self._downloader.process_info({
2835                                         'id':           video_id.decode('utf-8'),
2836                                         'url':          video_real_url.decode('utf-8'),
2837                                         'uploader':     video_uploader.decode('utf-8'),
2838                                         'upload_date':  upload_date,
2839                                         'title':        video_title,
2840                                         'stitle':       simple_title,
2841                                         'ext':          video_extension.decode('utf-8'),
2842                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2843                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2844                                         'description':  video_description.decode('utf-8'),
2845                                         'player_url':   None,
2846                                 })
2847                         except UnavailableVideoError, err:
2848                                 self._downloader.trouble(u'\nERROR: unable to download video')
2849
2850 class BlipTVIE(InfoExtractor):
2851         """Information extractor for blip.tv"""
2852
2853         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2854         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2855
2856         def report_extraction(self, file_id):
2857                 """Report information extraction."""
2858                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2859
2860         def _simplify_title(self, title):
2861                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2862                 res = res.strip(ur'_')
2863                 return res
2864
2865         def _real_extract(self, url):
2866                 mobj = re.match(self._VALID_URL, url)
2867                 if mobj is None:
2868                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2869                         return
2870
2871                 if '?' in url:
2872                         cchar = '&'
2873                 else:
2874                         cchar = '?'
2875                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2876                 request = urllib2.Request(json_url)
2877                 self.report_extraction(mobj.group(1))
2878                 try:
2879                         json_code = urllib2.urlopen(request).read()
2880                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2881                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2882                         return
2883                 try:
2884                         json_data = json.loads(json_code)
2885                         if 'Post' in json_data:
2886                                 data = json_data['Post']
2887                         else:
2888                                 data = json_data
2889
2890                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2891                         video_url = data['media']['url']
2892                         umobj = re.match(self._URL_EXT, video_url)
2893                         if umobj is None:
2894                                 raise ValueError('Can not determine filename extension')
2895                         ext = umobj.group(1)
2896
2897                         self._downloader.increment_downloads()
2898
2899                         info = {
2900                                 'id': data['item_id'],
2901                                 'url': video_url,
2902                                 'uploader': data['display_name'],
2903                                 'upload_date': upload_date,
2904                                 'title': data['title'],
2905                                 'stitle': self._simplify_title(data['title']),
2906                                 'ext': ext,
2907                                 'format': data['media']['mimeType'],
2908                                 'thumbnail': data['thumbnailUrl'],
2909                                 'description': data['description'],
2910                                 'player_url': data['embedUrl']
2911                         }
2912                 except (ValueError,KeyError), err:
2913                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2914                         return
2915
2916                 try:
2917                         self._downloader.process_info(info)
2918                 except UnavailableVideoError, err:
2919                         self._downloader.trouble(u'\nERROR: unable to download video')
2920
2921
2922 class MyVideoIE(InfoExtractor):
2923         """Information Extractor for myvideo.de."""
2924
2925         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2926
2927         def __init__(self, downloader=None):
2928                 InfoExtractor.__init__(self, downloader)
2929         
2930         def report_download_webpage(self, video_id):
2931                 """Report webpage download."""
2932                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2933
2934         def report_extraction(self, video_id):
2935                 """Report information extraction."""
2936                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2937
2938         def _real_initialize(self):
2939                 return
2940
2941         def _real_extract(self,url):
2942                 mobj = re.match(self._VALID_URL, url)
2943                 if mobj is None:
2944                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2945                         return
2946
2947                 video_id = mobj.group(1)
2948                 simple_title = mobj.group(2).decode('utf-8')
2949                 # should actually not be necessary
2950                 simple_title = sanitize_title(simple_title)
2951                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2952
2953                 # Get video webpage
2954                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2955                 try:
2956                         self.report_download_webpage(video_id)
2957                         webpage = urllib2.urlopen(request).read()
2958                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2959                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2960                         return
2961
2962                 self.report_extraction(video_id)
2963                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2964                                  webpage)
2965                 if mobj is None:
2966                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2967                         return
2968                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2969
2970                 mobj = re.search('<title>([^<]+)</title>', webpage)
2971                 if mobj is None:
2972                         self._downloader.trouble(u'ERROR: unable to extract title')
2973                         return
2974
2975                 video_title = mobj.group(1)
2976                 video_title = sanitize_title(video_title)
2977
2978                 try:
2979                         print(video_url)
2980                         self._downloader.process_info({
2981                                 'id':           video_id,
2982                                 'url':          video_url,
2983                                 'uploader':     u'NA',
2984                                 'upload_date':  u'NA',
2985                                 'title':        video_title,
2986                                 'stitle':       simple_title,
2987                                 'ext':          u'flv',
2988                                 'format':       u'NA',
2989                                 'player_url':   None,
2990                         })
2991                 except UnavailableVideoError:
2992                         self._downloader.trouble(u'\nERROR: Unable to download video')
2993
2994 class ComedyCentralIE(InfoExtractor):
2995         """Information extractor for The Daily Show and Colbert Report """
2996
2997         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2998
2999         def report_extraction(self, episode_id):
3000                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3001         
3002         def report_config_download(self, episode_id):
3003                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3004
3005         def report_index_download(self, episode_id):
3006                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3007
3008         def report_player_url(self, episode_id):
3009                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3010
3011         def _simplify_title(self, title):
3012                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3013                 res = res.strip(ur'_')
3014                 return res
3015
3016         def _real_extract(self, url):
3017                 mobj = re.match(self._VALID_URL, url)
3018                 if mobj is None:
3019                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3020                         return
3021
3022                 if mobj.group('shortname'):
3023                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3024                                 url = 'http://www.thedailyshow.com/full-episodes/'
3025                         else:
3026                                 url = 'http://www.colbertnation.com/full-episodes/'
3027                         mobj = re.match(self._VALID_URL, url)
3028                         assert mobj is not None
3029
3030                 dlNewest = not mobj.group('episode')
3031                 if dlNewest:
3032                         epTitle = mobj.group('showname')
3033                 else:
3034                         epTitle = mobj.group('episode')
3035
3036                 req = urllib2.Request(url)
3037                 self.report_extraction(epTitle)
3038                 try:
3039                         htmlHandle = urllib2.urlopen(req)
3040                         html = htmlHandle.read()
3041                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3042                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3043                         return
3044                 if dlNewest:
3045                         url = htmlHandle.geturl()
3046                         mobj = re.match(self._VALID_URL, url)
3047                         if mobj is None:
3048                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3049                                 return
3050                         if mobj.group('episode') == '':
3051                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3052                                 return
3053                         epTitle = mobj.group('episode')
3054
3055                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3056                 if len(mMovieParams) == 0:
3057                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3058                         return
3059
3060                 playerUrl_raw = mMovieParams[0][0]
3061                 self.report_player_url(epTitle)
3062                 try:
3063                         urlHandle = urllib2.urlopen(playerUrl_raw)
3064                         playerUrl = urlHandle.geturl()
3065                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3066                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3067                         return
3068
3069                 uri = mMovieParams[0][1]
3070                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3071                 self.report_index_download(epTitle)
3072                 try:
3073                         indexXml = urllib2.urlopen(indexUrl).read()
3074                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3075                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3076                         return
3077
3078                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3079                 itemEls = idoc.findall('.//item')
3080                 for itemEl in itemEls:
3081                         mediaId = itemEl.findall('./guid')[0].text
3082                         shortMediaId = mediaId.split(':')[-1]
3083                         showId = mediaId.split(':')[-2].replace('.com', '')
3084                         officialTitle = itemEl.findall('./title')[0].text
3085                         officialDate = itemEl.findall('./pubDate')[0].text
3086
3087                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3088                                                 urllib.urlencode({'uri': mediaId}))
3089                         configReq = urllib2.Request(configUrl)
3090                         self.report_config_download(epTitle)
3091                         try:
3092                                 configXml = urllib2.urlopen(configReq).read()
3093                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3094                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3095                                 return
3096
3097                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3098                         turls = []
3099                         for rendition in cdoc.findall('.//rendition'):
3100                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3101                                 turls.append(finfo)
3102
3103                         if len(turls) == 0:
3104                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3105                                 continue
3106
3107                         # For now, just pick the highest bitrate
3108                         format,video_url = turls[-1]
3109
3110                         self._downloader.increment_downloads()
3111
3112                         effTitle = showId + '-' + epTitle
3113                         info = {
3114                                 'id': shortMediaId,
3115                                 'url': video_url,
3116                                 'uploader': showId,
3117                                 'upload_date': officialDate,
3118                                 'title': effTitle,
3119                                 'stitle': self._simplify_title(effTitle),
3120                                 'ext': 'mp4',
3121                                 'format': format,
3122                                 'thumbnail': None,
3123                                 'description': officialTitle,
3124                                 'player_url': playerUrl
3125                         }
3126
3127                         try:
3128                                 self._downloader.process_info(info)
3129                         except UnavailableVideoError, err:
3130                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3131                                 continue
3132
3133
3134 class EscapistIE(InfoExtractor):
3135         """Information extractor for The Escapist """
3136
3137         _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3138
3139         def report_extraction(self, showName):
3140                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3141
3142         def report_config_download(self, showName):
3143                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3144
3145         def _simplify_title(self, title):
3146                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3147                 res = res.strip(ur'_')
3148                 return res
3149
3150         def _real_extract(self, url):
3151                 htmlParser = HTMLParser.HTMLParser()
3152
3153                 mobj = re.match(self._VALID_URL, url)
3154                 if mobj is None:
3155                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3156                         return
3157                 showName = mobj.group('showname')
3158                 videoId = mobj.group('episode')
3159
3160                 self.report_extraction(showName)
3161                 try:
3162                         webPage = urllib2.urlopen(url).read()
3163                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3164                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3165                         return
3166
3167                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3168                 description = htmlParser.unescape(descMatch.group(1))
3169                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3170                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3171                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3172                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3173                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3174                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3175
3176                 self.report_config_download(showName)
3177                 try:
3178                         configJSON = urllib2.urlopen(configUrl).read()
3179                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3180                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3181                         return
3182
3183                 # Technically, it's JavaScript, not JSON
3184                 configJSON = configJSON.replace("'", '"')
3185
3186                 try:
3187                         config = json.loads(configJSON)
3188                 except (ValueError,), err:
3189                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3190                         return
3191
3192                 playlist = config['playlist']
3193                 videoUrl = playlist[1]['url']
3194
3195                 self._downloader.increment_downloads()
3196                 info = {
3197                         'id': videoId,
3198                         'url': videoUrl,
3199                         'uploader': showName,
3200                         'upload_date': None,
3201                         'title': showName,
3202                         'stitle': self._simplify_title(showName),
3203                         'ext': 'flv',
3204                         'format': 'flv',
3205                         'thumbnail': imgUrl,
3206                         'description': description,
3207                         'player_url': playerUrl,
3208                 }
3209
3210                 try:
3211                         self._downloader.process_info(info)
3212                 except UnavailableVideoError, err:
3213                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3214
3215
3216
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a FileDownloader through the
	downloader's add_post_processor() method. After a successful
	download, the downloader walks its chain of post processors and
	invokes run() on each one, feeding the value returned by one
	processor into the next.

	Returning None from run() aborts the remaining chain; any other
	return value (an information dictionary) is handed on to the next
	processor.

	Like InfoExtractor, this class follows a "mutual registration"
	scheme with its downloader.
	"""

	# The downloader this post processor is registered with.
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach a downloader to this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		The "information" argument is an InfoExtractor-style dictionary
		augmented with a 'filepath' key naming the downloaded file.
		Subclasses may modify the dictionary (e.g. update 'filepath')
		and return it to continue the chain, or return None to stop
		further processing. A PostProcessingError may be raised to
		signal failure to the calling downloader.
		"""
		return information # the base implementation is a no-op
3262
3263
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into an audio-only file by invoking the external ffprobe/ffmpeg
	programs."""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# 'best', 'aac' or 'mp3' (see --audio-format).
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name reported by ffprobe for the file at
		path, or None if ffprobe fails or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# BUG FIX: the null-device handle was opened with the py2-only
			# file() builtin and never closed (leaked on every call).
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		# ffprobe prints codec_name= before codec_type= for each stream, so
		# remember the last codec seen and report it once an audio stream
		# is confirmed.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to transcode/copy the audio of path into out_path.
		Returns True on success, False on any failure."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# BUG FIX: close the null-device handle (was leaked before).
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath'], delete the original
		video and point 'filepath' at the new audio file. Returns None on
		failure (stopping the post-processing chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC streams need the ADTS container format.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3345
3346
3347 def updateSelf(downloader, filename):
3348         ''' Update the program file with the latest version from the repository '''
3349         # Note: downloader only used for options
3350         if not os.access(filename, os.W_OK):
3351                 sys.exit('ERROR: no write permissions on %s' % filename)
3352
3353         downloader.to_screen('Updating to latest version...')
3354
3355         try:
3356                 try:
3357                         urlh = urllib.urlopen(UPDATE_URL)
3358                         newcontent = urlh.read()
3359                 finally:
3360                         urlh.close()
3361         except (IOError, OSError), err:
3362                 sys.exit('ERROR: unable to download latest version')
3363
3364         try:
3365                 outf = open(filename, 'wb')
3366                 try:
3367                         outf.write(newcontent)
3368                 finally:
3369                         outf.close()
3370         except (IOError, OSError), err:
3371                 sys.exit('ERROR: unable to overwrite current version')
3372
3373         downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3374
def parseOpts():
	"""Build the option parser and parse the command line.

	Returns a (parser, opts, args) tuple; 'args' holds the positional
	URL arguments.
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		'''Best-effort detection of the terminal width; None if unknown.'''
		columns = os.environ.get('COLUMNS', None)
		if columns:
			# BUG FIX: guard against a non-numeric COLUMNS value instead of
			# crashing with ValueError.
			try:
				return int(columns)
			except ValueError:
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (IOError, OSError, ValueError, IndexError):
			# BUG FIX: was a bare 'except:' which also swallowed
			# KeyboardInterrupt/SystemExit; now only catches the errors
			# the stty probe can actually produce.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
3544
3545 def main():
3546         parser, opts, args = parseOpts()
3547
3548         # Open appropriate CookieJar
3549         if opts.cookiefile is None:
3550                 jar = cookielib.CookieJar()
3551         else:
3552                 try:
3553                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
3554                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3555                                 jar.load()
3556                 except (IOError, OSError), err:
3557                         sys.exit(u'ERROR: unable to open cookie file')
3558
3559         # Dump user agent
3560         if opts.dump_user_agent:
3561                 print std_headers['User-Agent']
3562                 sys.exit(0)
3563
3564         # General configuration
3565         cookie_processor = urllib2.HTTPCookieProcessor(jar)
3566         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3567         urllib2.install_opener(opener)
3568         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3569
3570         # Batch file verification
3571         batchurls = []
3572         if opts.batchfile is not None:
3573                 try:
3574                         if opts.batchfile == '-':
3575                                 batchfd = sys.stdin
3576                         else:
3577                                 batchfd = open(opts.batchfile, 'r')
3578                         batchurls = batchfd.readlines()
3579                         batchurls = [x.strip() for x in batchurls]
3580                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3581                 except IOError:
3582                         sys.exit(u'ERROR: batch file could not be read')
3583         all_urls = batchurls + args
3584
3585         # Conflicting, missing and erroneous options
3586         if opts.usenetrc and (opts.username is not None or opts.password is not None):
3587                 parser.error(u'using .netrc conflicts with giving username/password')
3588         if opts.password is not None and opts.username is None:
3589                 parser.error(u'account username missing')
3590         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3591                 parser.error(u'using output template conflicts with using title, literal title or auto number')
3592         if opts.usetitle and opts.useliteral:
3593                 parser.error(u'using title conflicts with using literal title')
3594         if opts.username is not None and opts.password is None:
3595                 opts.password = getpass.getpass(u'Type account password and press return:')
3596         if opts.ratelimit is not None:
3597                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3598                 if numeric_limit is None:
3599                         parser.error(u'invalid rate limit specified')
3600                 opts.ratelimit = numeric_limit
3601         if opts.retries is not None:
3602                 try:
3603                         opts.retries = long(opts.retries)
3604                 except (TypeError, ValueError), err:
3605                         parser.error(u'invalid retry count specified')
3606         try:
3607                 opts.playliststart = int(opts.playliststart)
3608                 if opts.playliststart <= 0:
3609                         raise ValueError(u'Playlist start must be positive')
3610         except (TypeError, ValueError), err:
3611                 parser.error(u'invalid playlist start number specified')
3612         try:
3613                 opts.playlistend = int(opts.playlistend)
3614                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3615                         raise ValueError(u'Playlist end must be greater than playlist start')
3616         except (TypeError, ValueError), err:
3617                 parser.error(u'invalid playlist end number specified')
3618         if opts.extractaudio:
3619                 if opts.audioformat not in ['best', 'aac', 'mp3']:
3620                         parser.error(u'invalid audio format specified')
3621
3622         # Information extractors
3623         youtube_ie = YoutubeIE()
3624         google_ie = GoogleIE()
3625         yahoo_ie = YahooIE()
3626         extractors = [ # Order does matter
3627                 youtube_ie,
3628                 MetacafeIE(youtube_ie),
3629                 DailymotionIE(),
3630                 YoutubePlaylistIE(youtube_ie),
3631                 YoutubeUserIE(youtube_ie),
3632                 YoutubeSearchIE(youtube_ie),
3633                 google_ie,
3634                 GoogleSearchIE(google_ie),
3635                 PhotobucketIE(),
3636                 yahoo_ie,
3637                 YahooSearchIE(yahoo_ie),
3638                 DepositFilesIE(),
3639                 FacebookIE(),
3640                 BlipTVIE(),
3641                 VimeoIE(),
3642                 MyVideoIE(),
3643                 ComedyCentralIE(),
3644                 EscapistIE(),
3645
3646                 GenericIE()
3647         ]
3648
3649         # File downloader
3650         fd = FileDownloader({
3651                 'usenetrc': opts.usenetrc,
3652                 'username': opts.username,
3653                 'password': opts.password,
3654                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3655                 'forceurl': opts.geturl,
3656                 'forcetitle': opts.gettitle,
3657                 'forcethumbnail': opts.getthumbnail,
3658                 'forcedescription': opts.getdescription,
3659                 'forcefilename': opts.getfilename,
3660                 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3661                 'format': opts.format,
3662                 'format_limit': opts.format_limit,
3663                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3664                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3665                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3666                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3667                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3668                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3669                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3670                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3671                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3672                         or u'%(id)s.%(ext)s'),
3673                 'ignoreerrors': opts.ignoreerrors,
3674                 'ratelimit': opts.ratelimit,
3675                 'nooverwrites': opts.nooverwrites,
3676                 'retries': opts.retries,
3677                 'continuedl': opts.continue_dl,
3678                 'noprogress': opts.noprogress,
3679                 'playliststart': opts.playliststart,
3680                 'playlistend': opts.playlistend,
3681                 'logtostderr': opts.outtmpl == '-',
3682                 'consoletitle': opts.consoletitle,
3683                 'nopart': opts.nopart,
3684                 'updatetime': opts.updatetime,
3685                 'writedescription': opts.writedescription,
3686                 'writeinfojson': opts.writeinfojson,
3687                 'matchtitle': opts.matchtitle,
3688                 'rejecttitle': opts.rejecttitle,
3689                 })
3690         for extractor in extractors:
3691                 fd.add_info_extractor(extractor)
3692
3693         # PostProcessors
3694         if opts.extractaudio:
3695                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3696
3697         # Update version
3698         if opts.update_self:
3699                 updateSelf(fd, sys.argv[0])
3700
3701         # Maybe do nothing
3702         if len(all_urls) < 1:
3703                 if not opts.update_self:
3704                         parser.error(u'you must provide at least one URL')
3705                 else:
3706                         sys.exit()
3707         retcode = fd.download(all_urls)
3708
3709         # Dump cookie jar if requested
3710         if opts.cookiefile is not None:
3711                 try:
3712                         jar.save()
3713                 except (IOError, OSError), err:
3714                         sys.exit(u'ERROR: unable to save cookie jar')
3715
3716         sys.exit(retcode)
3717
3718
if __name__ == '__main__':
	# Script entry point: translate the exceptions that main() is allowed
	# to leak into process exit statuses.
	try:
		main()
	except DownloadError:
		# The downloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		# Multiple downloads would clobber a single fixed output filename.
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: exit with a message rather than a traceback.
		sys.exit(u'\nERROR: Interrupted by user')
3728
3729 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: