xvideos: Normalize the URL or it will fail with some inputs.
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.30'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45         import ctypes
46
47 try:
48         import email.utils
49 except ImportError: # Python 2.4
50         import email.Utils
51 try:
52         import cStringIO as StringIO
53 except ImportError:
54         import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58         from urlparse import parse_qs
59 except ImportError:
60         from cgi import parse_qs
61
62 try:
63         import lxml.etree
64 except ImportError:
65         pass # Handled below
66
67 try:
68         import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers. YoutubeDLHandler.http_request() sets each of these on
# every outgoing request, replacing any value the request already carried.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits, as a unicode string.
# NOTE(review): not used in this part of the file; presumably the characters
# kept when building simplified titles -- confirm at the use site.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# Use the standard json module when it exists. Otherwise define a stand-in
# class that is also named "json" and only offers loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse the UTF-8 encoded JSON document s and return its value.

                        Objects become dicts, arrays become lists, strings become
                        unicode objects, numbers become int or float, and
                        true/false/null become True/False/None.

                        Raises ValueError on malformed input or on extra data after
                        the first value.
                        """
                        s = s.decode('UTF-8')
                        # All parse* helpers below take the index to start at and return
                        # a tuple (index just past the parsed value, parsed value).
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; with expectMore, reaching the end
                                # of the input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub() callback: group 1 is the text following a backslash
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Single \uXXXX escape
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Two consecutive \uXXXX escapes forming a surrogate
                                        # pair: combine them into one code point
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote
                                i += 1
                                e = i
                                while True:
                                        # Find the next quote; it only closes the string when it
                                        # is preceded by an even number of backslashes
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at the opening curly brace
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at the opening bracket
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The literals true, false and null
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or an exponent makes the result a float
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; any character not
                        # listed here is handed to parseNumber.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                # Trailing whitespace is skipped, but the input may end here
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
194
def preferredencoding():
        """Get preferred encoding.

        Returns the encoding reported by locale.getpreferredencoding() when
        that encoding can actually be used to encode text, and 'UTF-8'
        otherwise.
        """
        try:
                pref = locale.getpreferredencoding()
                # Make sure the codec exists and is usable
                u'TEST'.encode(pref)
        except Exception:
                # Unknown or unusable locale encoding: fall back to a safe
                # default. Only Exception is caught, so Ctrl-C and SystemExit
                # are not swallowed here.
                pref = 'UTF-8'
        return pref
210
211
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function. Group 1 of the match must hold the entity text
        without the leading '&' and the trailing ';'.

        Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
        references are accepted in decimal (#233) and hexadecimal (#xe9)
        form. Anything else, including a numeric reference whose code point
        unichr() cannot represent, is returned as its literal '&...;' text.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference. The whole entity has to match, so that
        # the hexadecimal digits a-f are accepted and trailing junk is rejected
        # instead of being silently cut off.
        mobj = re.match(u'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr[0] in u'xX':
                        codepoint = int(numstr[1:], 16)
                else:
                        codepoint = int(numstr, 10)
                try:
                        return unichr(codepoint)
                except (ValueError, OverflowError):
                        # Code point outside the range unichr() supports:
                        # fall through and keep the literal text
                        pass

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
        """Decode the HTML entities in a title and replace os.sep with '%'.

        The result can be used as part of a filename.
        """
        decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
        separator = unicode(os.sep)
        return decoded.replace(separator, u'%')
243
244
def sanitize_open(filename, open_mode):
        """Open filename, retrying once with a tweaked name if that fails.

        The name u'-' stands for standard output, which is switched to binary
        mode on win32. If opening raises IOError or OSError, the characters
        / < > : " | ? * are replaced by '#' and the open is attempted a second
        time; an error from that second attempt propagates to the caller.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename != u'-':
                        return (open(filename, open_mode), filename)
                if sys.platform == 'win32':
                        import msvcrt
                        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                return (sys.stdout, filename)
        except (IOError, OSError):
                # In case of error, try to remove win32 forbidden chars
                cleaned = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

                # An exception here should be caught in the caller
                return (open(cleaned, open_mode), cleaned)
270
271
def timeconvert(timestr):
        """Convert an RFC 2822 time string into a system timestamp.

        Returns None when the string cannot be parsed.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
        """Raised when a download fails.

        FileDownloader objects may raise this exception when they are not
        configured to continue on errors. It carries the corresponding
        error message.
        """
289
290
class SameFileError(Exception):
        """Raised when several downloads would end up in the same file.

        FileDownloader objects raise this exception when they detect that
        multiple files would have to be downloaded to the same file on disk.
        """
298
299
class PostProcessingError(Exception):
        """Raised when a postprocessing task fails.

        A PostProcessor's .run() method may raise this exception to indicate
        an error in the postprocessing task.
        """
307
308
class UnavailableVideoError(Exception):
        """Raised when a requested format is unavailable.

        This exception is thrown when a video is requested in a format that
        is not available for that video.
        """
316
317
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.

        The two byte counts are available as the attributes downloaded and
        expected, and also as the exception's args.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Hand the values to the base class as well, so that args,
                # str() and repr() are not empty and copying the exception
                # can call the constructor again with the same arguments.
                Exception.__init__(self, downloaded, expected)
                self.downloaded = downloaded
                self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
        """HTTP handler that adds the standard headers and decompresses replies.

        When installed in an OpenerDirector, every request gets the headers
        from std_headers, and responses whose Content-encoding is gzip or
        deflate are transparently decompressed. A request that must not be
        compressed only has to carry the header "Youtubedl-No-Compression";
        that header is removed before the real request is made.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try a raw deflate stream first, then a zlib-wrapped one
                try:
                        inflated = zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        inflated = zlib.decompress(data)
                return inflated

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                if not hasattr(urllib2.addinfourl, 'getcode'):
                        # This addinfourl takes no code argument: set it afterwards
                        wrapped = urllib2.addinfourl(stream, headers, url)
                        wrapped.code = code
                        return wrapped
                return urllib2.addinfourl(stream, headers, url, code)

        def http_request(self, req):
                # The standard headers replace whatever the request carried
                for name, value in std_headers.items():
                        if name in req.headers:
                                del req.headers[name]
                        req.add_header(name, value)
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                original = resp
                encoding = resp.headers.get('Content-encoding', '')
                if encoding == 'gzip':
                        body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                elif encoding == 'deflate':
                        body = StringIO.StringIO(self.deflate(resp.read()))
                else:
                        # Not compressed: hand the response back untouched
                        return resp
                resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
                resp.msg = original.msg
                return resp
391
392
393 class FileDownloader(object):
394         """File Downloader class.
395
396         File downloader objects are the ones responsible for downloading the
397         actual video file and writing it to disk if the user has requested
398         it, among some other tasks. In most cases there should be one per
399         program. Since, given a video URL, the downloader doesn't know how to
400         extract all the needed information (a task that InfoExtractors
401         perform), it has to pass the URL to one of them.
402
403         For this, file downloader objects have a method that allows
404         InfoExtractors to be registered in a given order. When it is passed
405         a URL, the file downloader hands it to the first InfoExtractor it
406         finds that reports being able to handle it. The InfoExtractor extracts
407         all the information about the video or videos the URL refers to, and
408         asks the FileDownloader to process the video information, possibly
409         downloading the video.
410
411         File downloaders accept a lot of parameters. In order not to saturate
412         the object constructor with arguments, it receives a dictionary of
413         options instead. These options are available through the params
414         attribute for the InfoExtractors to use. The FileDownloader also
415         registers itself as the downloader in charge for the InfoExtractors
416         that are added to it, so this is a "mutual registration".
417
418         Available options:
419
420         username:         Username for authentication purposes.
421         password:         Password for authentication purposes.
422         usenetrc:         Use netrc for authentication instead.
423         quiet:            Do not print messages to stdout.
424         forceurl:         Force printing final URL.
425         forcetitle:       Force printing title.
426         forcethumbnail:   Force printing thumbnail URL.
427         forcedescription: Force printing description.
428         forcefilename:    Force printing final filename.
429         simulate:         Do not download the video files.
430         format:           Video format code.
431         format_limit:     Highest quality format to try.
432         outtmpl:          Template for output names.
433         ignoreerrors:     Do not stop on download errors.
434         ratelimit:        Download speed limit, in bytes/sec.
435         nooverwrites:     Prevent overwriting files.
436         retries:          Number of times to retry for HTTP error 5xx
437         continuedl:       Try to continue downloads if possible.
438         noprogress:       Do not print the progress bar.
439         playliststart:    Playlist item to start at.
440         playlistend:      Playlist item to end at.
441         matchtitle:       Download only matching titles.
442         rejecttitle:      Reject downloads for matching titles.
443         logtostderr:      Log messages to stderr instead of stdout.
444         consoletitle:     Display progress in console window's titlebar.
445         nopart:           Do not use temporary .part files.
446         updatetime:       Use the Last-modified header to set output file timestamps.
447         writedescription: Write the video description to a .description file
448         writeinfojson:    Write the video description to a .info.json file
449         """
450
451         params = None
452         _ies = []
453         _pps = []
454         _download_retcode = None
455         _num_downloads = None
456         _screen_file = None
457
458         def __init__(self, params):
459                 """Create a FileDownloader object with the given options."""
460                 self._ies = []
461                 self._pps = []
462                 self._download_retcode = 0
463                 self._num_downloads = 0
464                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465                 self.params = params
466
467         @staticmethod
468         def format_bytes(bytes):
469                 if bytes is None:
470                         return 'N/A'
471                 if type(bytes) is str:
472                         bytes = float(bytes)
473                 if bytes == 0.0:
474                         exponent = 0
475                 else:
476                         exponent = long(math.log(bytes, 1024.0))
477                 suffix = 'bkMGTPEZY'[exponent]
478                 converted = float(bytes) / float(1024 ** exponent)
479                 return '%.2f%s' % (converted, suffix)
480
481         @staticmethod
482         def calc_percent(byte_counter, data_len):
483                 if data_len is None:
484                         return '---.-%'
485                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487         @staticmethod
488         def calc_eta(start, now, total, current):
489                 if total is None:
490                         return '--:--'
491                 dif = now - start
492                 if current == 0 or dif < 0.001: # One millisecond
493                         return '--:--'
494                 rate = float(current) / dif
495                 eta = long((float(total) - float(current)) / rate)
496                 (eta_mins, eta_secs) = divmod(eta, 60)
497                 if eta_mins > 99:
498                         return '--:--'
499                 return '%02d:%02d' % (eta_mins, eta_secs)
500
501         @staticmethod
502         def calc_speed(start, now, bytes):
503                 dif = now - start
504                 if bytes == 0 or dif < 0.001: # One millisecond
505                         return '%10s' % '---b/s'
506                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508         @staticmethod
509         def best_block_size(elapsed_time, bytes):
510                 new_min = max(bytes / 2.0, 1.0)
511                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512                 if elapsed_time < 0.001:
513                         return long(new_max)
514                 rate = bytes / elapsed_time
515                 if rate > new_max:
516                         return long(new_max)
517                 if rate < new_min:
518                         return long(new_min)
519                 return long(rate)
520
521         @staticmethod
522         def parse_bytes(bytestr):
523                 """Parse a string indicating a byte quantity into a long integer."""
524                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525                 if matchobj is None:
526                         return None
527                 number = float(matchobj.group(1))
528                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529                 return long(round(number * multiplier))
530
531         def add_info_extractor(self, ie):
532                 """Add an InfoExtractor object to the end of the list."""
533                 self._ies.append(ie)
534                 ie.set_downloader(self)
535
536         def add_post_processor(self, pp):
537                 """Add a PostProcessor object to the end of the chain."""
538                 self._pps.append(pp)
539                 pp.set_downloader(self)
540
541         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542                 """Print message to stdout if not in quiet mode."""
543                 try:
544                         if not self.params.get('quiet', False):
545                                 terminator = [u'\n', u''][skip_eol]
546                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547                         self._screen_file.flush()
548                 except (UnicodeEncodeError), err:
549                         if not ignore_encoding_errors:
550                                 raise
551
552         def to_stderr(self, message):
553                 """Print message to stderr."""
554                 print >>sys.stderr, message.encode(preferredencoding())
555
556         def to_cons_title(self, message):
557                 """Set console/terminal window title to message."""
558                 if not self.params.get('consoletitle', False):
559                         return
560                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561                         # c_wchar_p() might not be necessary if `message` is
562                         # already of type unicode()
563                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564                 elif 'TERM' in os.environ:
565                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567         def fixed_template(self):
568                 """Checks if the output template is fixed."""
569                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571         def trouble(self, message=None):
572                 """Determine action to take when a download problem appears.
573
574                 Depending on if the downloader has been configured to ignore
575                 download errors or not, this method may throw an exception or
576                 not when errors are found, after printing the message.
577                 """
578                 if message is not None:
579                         self.to_stderr(message)
580                 if not self.params.get('ignoreerrors', False):
581                         raise DownloadError(message)
582                 self._download_retcode = 1
583
584         def slow_down(self, start_time, byte_counter):
585                 """Sleep if the download speed is over the rate limit."""
586                 rate_limit = self.params.get('ratelimit', None)
587                 if rate_limit is None or byte_counter == 0:
588                         return
589                 now = time.time()
590                 elapsed = now - start_time
591                 if elapsed <= 0.0:
592                         return
593                 speed = float(byte_counter) / elapsed
594                 if speed > rate_limit:
595                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597         def temp_name(self, filename):
598                 """Returns a temporary filename for the given filename."""
599                 if self.params.get('nopart', False) or filename == u'-' or \
600                                 (os.path.exists(filename) and not os.path.isfile(filename)):
601                         return filename
602                 return filename + u'.part'
603
604         def undo_temp_name(self, filename):
605                 if filename.endswith(u'.part'):
606                         return filename[:-len(u'.part')]
607                 return filename
608
609         def try_rename(self, old_filename, new_filename):
610                 try:
611                         if old_filename == new_filename:
612                                 return
613                         os.rename(old_filename, new_filename)
614                 except (IOError, OSError), err:
615                         self.trouble(u'ERROR: unable to rename file')
616
617         def try_utime(self, filename, last_modified_hdr):
618                 """Try to set the last-modified time of the given file."""
619                 if last_modified_hdr is None:
620                         return
621                 if not os.path.isfile(filename):
622                         return
623                 timestr = last_modified_hdr
624                 if timestr is None:
625                         return
626                 filetime = timeconvert(timestr)
627                 if filetime is None:
628                         return filetime
629                 try:
630                         os.utime(filename, (time.time(), filetime))
631                 except:
632                         pass
633                 return filetime
634
635         def report_writedescription(self, descfn):
636                 """ Report that the description file is being written """
637                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639         def report_writeinfojson(self, infofn):
640                 """ Report that the metadata file has been written """
641                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
643         def report_destination(self, filename):
644                 """Report destination filename."""
645                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648                 """Report download progress."""
649                 if self.params.get('noprogress', False):
650                         return
651                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
656         def report_resuming_byte(self, resume_len):
657                 """Report attempt to resume at given byte."""
658                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
660         def report_retry(self, count, retries):
661                 """Report retry in case of HTTP error 5xx"""
662                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
664         def report_file_already_downloaded(self, file_name):
665                 """Report file has already been fully downloaded."""
666                 try:
667                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
668                 except (UnicodeEncodeError), err:
669                         self.to_screen(u'[download] The file has already been downloaded')
670
671         def report_unable_to_resume(self):
672                 """Report it was impossible to resume download."""
673                 self.to_screen(u'[download] Unable to resume')
674
675         def report_finish(self):
676                 """Report download finished."""
677                 if self.params.get('noprogress', False):
678                         self.to_screen(u'[download] Download completed')
679                 else:
680                         self.to_screen(u'')
681
682         def increment_downloads(self):
683                 """Increment the ordinal that assigns a number to each file."""
684                 self._num_downloads += 1
685
686         def prepare_filename(self, info_dict):
687                 """Generate the output filename."""
688                 try:
689                         template_dict = dict(info_dict)
690                         template_dict['epoch'] = unicode(long(time.time()))
691                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692                         filename = self.params['outtmpl'] % template_dict
693                         return filename
694                 except (ValueError, KeyError), err:
695                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
696                         return None
697
698         def process_info(self, info_dict):
699                 """Process a single dictionary returned by an InfoExtractor."""
700                 filename = self.prepare_filename(info_dict)
701                 
702                 # Forced printings
703                 if self.params.get('forcetitle', False):
704                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705                 if self.params.get('forceurl', False):
706                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709                 if self.params.get('forcedescription', False) and 'description' in info_dict:
710                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711                 if self.params.get('forcefilename', False) and filename is not None:
712                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713                 if self.params.get('forceformat', False):
714                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
715
716                 # Do nothing else if in simulate mode
717                 if self.params.get('simulate', False):
718                         return
719
720                 if filename is None:
721                         return
722
723                 matchtitle=self.params.get('matchtitle',False)
724                 rejecttitle=self.params.get('rejecttitle',False)
725                 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727                         self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
728                         return
729                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730                         self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
731                         return
732                         
733                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734                         self.to_stderr(u'WARNING: file exists and will be skipped')
735                         return
736
737                 try:
738                         dn = os.path.dirname(filename)
739                         if dn != '' and not os.path.exists(dn):
740                                 os.makedirs(dn)
741                 except (OSError, IOError), err:
742                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
743                         return
744
745                 if self.params.get('writedescription', False):
746                         try:
747                                 descfn = filename + '.description'
748                                 self.report_writedescription(descfn)
749                                 descfile = open(descfn, 'wb')
750                                 try:
751                                         descfile.write(info_dict['description'].encode('utf-8'))
752                                 finally:
753                                         descfile.close()
754                         except (OSError, IOError):
755                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
756                                 return
757
758                 if self.params.get('writeinfojson', False):
759                         infofn = filename + '.info.json'
760                         self.report_writeinfojson(infofn)
761                         try:
762                                 json.dump
763                         except (NameError,AttributeError):
764                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
765                                 return
766                         try:
767                                 infof = open(infofn, 'wb')
768                                 try:
769                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
770                                         json.dump(json_info_dict, infof)
771                                 finally:
772                                         infof.close()
773                         except (OSError, IOError):
774                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
775                                 return
776
777                 if not self.params.get('skip_download', False):
778                         try:
779                                 success = self._do_download(filename, info_dict)
780                         except (OSError, IOError), err:
781                                 raise UnavailableVideoError
782                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
783                                 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
784                                 return
785                         except (ContentTooShortError, ), err:
786                                 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
787                                 return
788         
789                         if success:
790                                 try:
791                                         self.post_process(filename, info_dict)
792                                 except (PostProcessingError), err:
793                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
794                                         return
795
796         def download(self, url_list):
797                 """Download a given list of URLs."""
798                 if len(url_list) > 1 and self.fixed_template():
799                         raise SameFileError(self.params['outtmpl'])
800
801                 for url in url_list:
802                         suitable_found = False
803                         for ie in self._ies:
804                                 # Go to next InfoExtractor if not suitable
805                                 if not ie.suitable(url):
806                                         continue
807
808                                 # Suitable InfoExtractor found
809                                 suitable_found = True
810
811                                 # Extract information from URL and process it
812                                 ie.extract(url)
813
814                                 # Suitable InfoExtractor had been found; go to next URL
815                                 break
816
817                         if not suitable_found:
818                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820                 return self._download_retcode
821
822         def post_process(self, filename, ie_info):
823                 """Run the postprocessing chain on the given file."""
824                 info = dict(ie_info)
825                 info['filepath'] = filename
826                 for pp in self._pps:
827                         info = pp.run(info)
828                         if info is None:
829                                 break
830
831         def _download_with_rtmpdump(self, filename, url, player_url):
832                 self.report_destination(filename)
833                 tmpfilename = self.temp_name(filename)
834
835                 # Check for rtmpdump first
836                 try:
837                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
838                 except (OSError, IOError):
839                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
840                         return False
841
842                 # Download using rtmpdump. rtmpdump returns exit code 2 when
843                 # the connection was interrumpted and resuming appears to be
844                 # possible. This is part of rtmpdump's normal usage, AFAIK.
845                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
846                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
847                 while retval == 2 or retval == 1:
848                         prevsize = os.path.getsize(tmpfilename)
849                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
850                         time.sleep(5.0) # This seems to be needed
851                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
852                         cursize = os.path.getsize(tmpfilename)
853                         if prevsize == cursize and retval == 1:
854                                 break
855                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
856                         if prevsize == cursize and retval == 2 and cursize > 1024:
857                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
858                                 retval = 0
859                                 break
860                 if retval == 0:
861                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
862                         self.try_rename(tmpfilename, filename)
863                         return True
864                 else:
865                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
866                         return False
867
868         def _do_download(self, filename, info_dict):
869                 url = info_dict['url']
870                 player_url = info_dict.get('player_url', None)
871
872                 # Check file already present
873                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
874                         self.report_file_already_downloaded(filename)
875                         return True
876
877                 # Attempt to download using rtmpdump
878                 if url.startswith('rtmp'):
879                         return self._download_with_rtmpdump(filename, url, player_url)
880
881                 tmpfilename = self.temp_name(filename)
882                 stream = None
883
884                 # Do not include the Accept-Encoding header
885                 headers = {'Youtubedl-no-compression': 'True'}
886                 basic_request = urllib2.Request(url, None, headers)
887                 request = urllib2.Request(url, None, headers)
888
889                 # Establish possible resume length
890                 if os.path.isfile(tmpfilename):
891                         resume_len = os.path.getsize(tmpfilename)
892                 else:
893                         resume_len = 0
894
895                 open_mode = 'wb'
896                 if resume_len != 0:
897                         if self.params.get('continuedl', False):
898                                 self.report_resuming_byte(resume_len)
899                                 request.add_header('Range','bytes=%d-' % resume_len)
900                                 open_mode = 'ab'
901                         else:
902                                 resume_len = 0
903
904                 count = 0
905                 retries = self.params.get('retries', 0)
906                 while count <= retries:
907                         # Establish connection
908                         try:
909                                 if count == 0 and 'urlhandle' in info_dict:
910                                         data = info_dict['urlhandle']
911                                 data = urllib2.urlopen(request)
912                                 break
913                         except (urllib2.HTTPError, ), err:
914                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
915                                         # Unexpected HTTP error
916                                         raise
917                                 elif err.code == 416:
918                                         # Unable to resume (requested range not satisfiable)
919                                         try:
920                                                 # Open the connection again without the range header
921                                                 data = urllib2.urlopen(basic_request)
922                                                 content_length = data.info()['Content-Length']
923                                         except (urllib2.HTTPError, ), err:
924                                                 if err.code < 500 or err.code >= 600:
925                                                         raise
926                                         else:
927                                                 # Examine the reported length
928                                                 if (content_length is not None and
929                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
930                                                         # The file had already been fully downloaded.
931                                                         # Explanation to the above condition: in issue #175 it was revealed that
932                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
933                                                         # changing the file size slightly and causing problems for some users. So
934                                                         # I decided to implement a suggested change and consider the file
935                                                         # completely downloaded if the file size differs less than 100 bytes from
936                                                         # the one in the hard drive.
937                                                         self.report_file_already_downloaded(filename)
938                                                         self.try_rename(tmpfilename, filename)
939                                                         return True
940                                                 else:
941                                                         # The length does not match, we start the download over
942                                                         self.report_unable_to_resume()
943                                                         open_mode = 'wb'
944                                                         break
945                         # Retry
946                         count += 1
947                         if count <= retries:
948                                 self.report_retry(count, retries)
949
950                 if count > retries:
951                         self.trouble(u'ERROR: giving up after %s retries' % retries)
952                         return False
953
954                 data_len = data.info().get('Content-length', None)
955                 if data_len is not None:
956                         data_len = long(data_len) + resume_len
957                 data_len_str = self.format_bytes(data_len)
958                 byte_counter = 0 + resume_len
959                 block_size = 1024
960                 start = time.time()
961                 while True:
962                         # Download and write
963                         before = time.time()
964                         data_block = data.read(block_size)
965                         after = time.time()
966                         if len(data_block) == 0:
967                                 break
968                         byte_counter += len(data_block)
969
970                         # Open file just in time
971                         if stream is None:
972                                 try:
973                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
974                                         assert stream is not None
975                                         filename = self.undo_temp_name(tmpfilename)
976                                         self.report_destination(filename)
977                                 except (OSError, IOError), err:
978                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
979                                         return False
980                         try:
981                                 stream.write(data_block)
982                         except (IOError, OSError), err:
983                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
984                                 return False
985                         block_size = self.best_block_size(after - before, len(data_block))
986
987                         # Progress message
988                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
989                         if data_len is None:
990                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
991                         else:
992                                 percent_str = self.calc_percent(byte_counter, data_len)
993                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
994                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
995
996                         # Apply rate limit
997                         self.slow_down(start, byte_counter - resume_len)
998
999                 if stream is None:
1000                         self.trouble(u'\nERROR: Did not get any data blocks')
1001                         return False
1002                 stream.close()
1003                 self.report_finish()
1004                 if data_len is not None and byte_counter != data_len:
1005                         raise ContentTooShortError(byte_counter, long(data_len))
1006                 self.try_rename(tmpfilename, filename)
1007
1008                 # Update file modification time
1009                 if self.params.get('updatetime', True):
1010                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1011
1012                 return True
1013
1014
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor takes a URL and extracts information about
        the video (or videos) it refers to: the real video URL, the title and
        simplified title, the author and so on. That information is stored in
        a dictionary which is handed to the FileDownloader, which may then
        download the video to the file system, among other possible outcomes.
        Every dictionary must contain these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        These fields are optional. They mainly exist so youtube-dl can act as
        the backend of a video search function (such as the one in
        youtube2mp3) and are only used when the corresponding forced printing
        options are requested:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should override _real_initialize() and _real_extract() and
        define a _VALID_URL regexp. They should probably also be added to the
        list of extractors.
        """

        # Class-level defaults; __init__ sets both again on each instance.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True when url matches this extractor's _VALID_URL."""
                matched = re.match(self._VALID_URL, url)
                return matched is not None

        def initialize(self):
                """Run the one-time initialization (authentication, etc)."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return what _real_extract(url) returns."""
                self.initialize()
                result = self._real_extract(url)
                return result

        def set_downloader(self, downloader):
                """Attach the downloader this extractor reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Actual initialization work. Override in subclasses."""
                pass

        def _real_extract(self, url):
                """Actual extraction work. Override in subclasses."""
                pass
1083
1084
1085 class YoutubeIE(InfoExtractor):
1086         """Information extractor for youtube.com."""
1087
        # URL pattern accepted by this extractor. Group 1 is the optional URL
        # prefix and group 2 is the video id (_real_extract reads group 2).
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Requested by _real_initialize() to set the interface language
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # Target of the login form POST in _real_initialize()
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        # Target of the age confirmation POST in _real_initialize()
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in .netrc when the 'usenetrc' option is set
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        # Format code -> file extension. _print_formats() falls back to 'flv'
        # for codes that are not listed here.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
        }
        # Format code -> dimensions string shown by _print_formats(), which
        # falls back to '???' for unknown codes.
        # NOTE(review): the values look like HEIGHTxWIDTH - confirm.
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
        }
        IE_NAME = u'youtube'
1122
1123         def report_lang(self):
1124                 """Report attempt to set language."""
1125                 self._downloader.to_screen(u'[youtube] Setting language')
1126
1127         def report_login(self):
1128                 """Report attempt to log in."""
1129                 self._downloader.to_screen(u'[youtube] Logging in')
1130
1131         def report_age_confirmation(self):
1132                 """Report attempt to confirm age."""
1133                 self._downloader.to_screen(u'[youtube] Confirming age')
1134
1135         def report_video_webpage_download(self, video_id):
1136                 """Report attempt to download video webpage."""
1137                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1138
1139         def report_video_info_webpage_download(self, video_id):
1140                 """Report attempt to download video info webpage."""
1141                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1142
1143         def report_information_extraction(self, video_id):
1144                 """Report attempt to extract video information."""
1145                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1146
1147         def report_unavailable_format(self, video_id, format):
1148                 """Report extracted video URL."""
1149                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1150
1151         def report_rtmp_download(self):
1152                 """Indicate the download will use the RTMP protocol."""
1153                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1154
1155         def _print_formats(self, formats):
1156                 print 'Available formats:'
1157                 for x in formats:
1158                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1159
1160         def _real_initialize(self):
1161                 if self._downloader is None:
1162                         return
1163
1164                 username = None
1165                 password = None
1166                 downloader_params = self._downloader.params
1167
1168                 # Attempt to use provided username and password or .netrc data
1169                 if downloader_params.get('username', None) is not None:
1170                         username = downloader_params['username']
1171                         password = downloader_params['password']
1172                 elif downloader_params.get('usenetrc', False):
1173                         try:
1174                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1175                                 if info is not None:
1176                                         username = info[0]
1177                                         password = info[2]
1178                                 else:
1179                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1180                         except (IOError, netrc.NetrcParseError), err:
1181                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1182                                 return
1183
1184                 # Set language
1185                 request = urllib2.Request(self._LANG_URL)
1186                 try:
1187                         self.report_lang()
1188                         urllib2.urlopen(request).read()
1189                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1191                         return
1192
1193                 # No authentication to be performed
1194                 if username is None:
1195                         return
1196
1197                 # Log in
1198                 login_form = {
1199                                 'current_form': 'loginForm',
1200                                 'next':         '/',
1201                                 'action_login': 'Log In',
1202                                 'username':     username,
1203                                 'password':     password,
1204                                 }
1205                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1206                 try:
1207                         self.report_login()
1208                         login_results = urllib2.urlopen(request).read()
1209                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1210                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1211                                 return
1212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1214                         return
1215
1216                 # Confirm age
1217                 age_form = {
1218                                 'next_url':             '/',
1219                                 'action_confirm':       'Confirm',
1220                                 }
1221                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1222                 try:
1223                         self.report_age_confirmation()
1224                         age_results = urllib2.urlopen(request).read()
1225                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1226                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1227                         return
1228
1229         def _real_extract(self, url):
1230                 # Extract video id from URL
1231                 mobj = re.match(self._VALID_URL, url)
1232                 if mobj is None:
1233                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1234                         return
1235                 video_id = mobj.group(2)
1236
1237                 # Get video webpage
1238                 self.report_video_webpage_download(video_id)
1239                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1240                 try:
1241                         video_webpage = urllib2.urlopen(request).read()
1242                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1243                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1244                         return
1245
1246                 # Attempt to extract SWF player URL
1247                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1248                 if mobj is not None:
1249                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1250                 else:
1251                         player_url = None
1252
1253                 # Get video info
1254                 self.report_video_info_webpage_download(video_id)
1255                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1256                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1257                                         % (video_id, el_type))
1258                         request = urllib2.Request(video_info_url)
1259                         try:
1260                                 video_info_webpage = urllib2.urlopen(request).read()
1261                                 video_info = parse_qs(video_info_webpage)
1262                                 if 'token' in video_info:
1263                                         break
1264                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1265                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1266                                 return
1267                 if 'token' not in video_info:
1268                         if 'reason' in video_info:
1269                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1270                         else:
1271                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1272                         return
1273
1274                 # Start extracting information
1275                 self.report_information_extraction(video_id)
1276
1277                 # uploader
1278                 if 'author' not in video_info:
1279                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1280                         return
1281                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1282
1283                 # title
1284                 if 'title' not in video_info:
1285                         self._downloader.trouble(u'ERROR: unable to extract video title')
1286                         return
1287                 video_title = urllib.unquote_plus(video_info['title'][0])
1288                 video_title = video_title.decode('utf-8')
1289                 video_title = sanitize_title(video_title)
1290
1291                 # simplified title
1292                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1293                 simple_title = simple_title.strip(ur'_')
1294
1295                 # thumbnail image
1296                 if 'thumbnail_url' not in video_info:
1297                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1298                         video_thumbnail = ''
1299                 else:   # don't panic if we can't find it
1300                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1301
1302                 # upload date
1303                 upload_date = u'NA'
1304                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1305                 if mobj is not None:
1306                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1307                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1308                         for expression in format_expressions:
1309                                 try:
1310                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1311                                 except:
1312                                         pass
1313
1314                 # description
1315                 try:
1316                         lxml.etree
1317                 except NameError:
1318                         video_description = u'No description available.'
1319                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1320                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1321                                 if mobj is not None:
1322                                         video_description = mobj.group(1).decode('utf-8')
1323                 else:
1324                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1325                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1326                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1327                         # TODO use another parser
1328
1329                 # token
1330                 video_token = urllib.unquote_plus(video_info['token'][0])
1331
1332                 # Decide which formats to download
1333                 req_format = self._downloader.params.get('format', None)
1334
1335                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1336                         self.report_rtmp_download()
1337                         video_url_list = [(None, video_info['conn'][0])]
1338                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1339                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1340                         url_data = [parse_qs(uds) for uds in url_data_strs]
1341                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1342                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1343
1344                         format_limit = self._downloader.params.get('format_limit', None)
1345                         if format_limit is not None and format_limit in self._available_formats:
1346                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1347                         else:
1348                                 format_list = self._available_formats
1349                         existing_formats = [x for x in format_list if x in url_map]
1350                         if len(existing_formats) == 0:
1351                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1352                                 return
1353                         if self._downloader.params.get('listformats', None):
1354                                 self._print_formats(existing_formats)
1355                                 return
1356                         if req_format is None or req_format == 'best':
1357                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1358                         elif req_format == 'worst':
1359                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1360                         elif req_format in ('-1', 'all'):
1361                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1362                         else:
1363                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1364                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1365                                 req_formats = req_format.split('/')
1366                                 video_url_list = None
1367                                 for rf in req_formats:
1368                                         if rf in url_map:
1369                                                 video_url_list = [(rf, url_map[rf])]
1370                                                 break
1371                                 if video_url_list is None:
1372                                         self._downloader.trouble(u'ERROR: requested format not available')
1373                                         return
1374                 else:
1375                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1376                         return
1377
1378                 for format_param, video_real_url in video_url_list:
1379                         # At this point we have a new video
1380                         self._downloader.increment_downloads()
1381
1382                         # Extension
1383                         video_extension = self._video_extensions.get(format_param, 'flv')
1384
1385                         try:
1386                                 # Process video information
1387                                 self._downloader.process_info({
1388                                         'id':           video_id.decode('utf-8'),
1389                                         'url':          video_real_url.decode('utf-8'),
1390                                         'uploader':     video_uploader.decode('utf-8'),
1391                                         'upload_date':  upload_date,
1392                                         'title':        video_title,
1393                                         'stitle':       simple_title,
1394                                         'ext':          video_extension.decode('utf-8'),
1395                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1396                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1397                                         'description':  video_description,
1398                                         'player_url':   player_url,
1399                                 })
1400                         except UnavailableVideoError, err:
1401                                 self._downloader.trouble(u'\nERROR: unable to download video')
1402
1403
1404 class MetacafeIE(InfoExtractor):
1405         """Information Extractor for metacafe.com."""
1406
1407         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1408         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1409         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1410         _youtube_ie = None
1411         IE_NAME = u'metacafe'
1412
1413         def __init__(self, youtube_ie, downloader=None):
1414                 InfoExtractor.__init__(self, downloader)
1415                 self._youtube_ie = youtube_ie
1416
1417         def report_disclaimer(self):
1418                 """Report disclaimer retrieval."""
1419                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1420
1421         def report_age_confirmation(self):
1422                 """Report attempt to confirm age."""
1423                 self._downloader.to_screen(u'[metacafe] Confirming age')
1424
1425         def report_download_webpage(self, video_id):
1426                 """Report webpage download."""
1427                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1428
1429         def report_extraction(self, video_id):
1430                 """Report information extraction."""
1431                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1432
1433         def _real_initialize(self):
1434                 # Retrieve disclaimer
1435                 request = urllib2.Request(self._DISCLAIMER)
1436                 try:
1437                         self.report_disclaimer()
1438                         disclaimer = urllib2.urlopen(request).read()
1439                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1440                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1441                         return
1442
1443                 # Confirm age
1444                 disclaimer_form = {
1445                         'filters': '0',
1446                         'submit': "Continue - I'm over 18",
1447                         }
1448                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1449                 try:
1450                         self.report_age_confirmation()
1451                         disclaimer = urllib2.urlopen(request).read()
1452                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1454                         return
1455
1456         def _real_extract(self, url):
1457                 # Extract id and simplified title from URL
1458                 mobj = re.match(self._VALID_URL, url)
1459                 if mobj is None:
1460                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1461                         return
1462
1463                 video_id = mobj.group(1)
1464
1465                 # Check if video comes from YouTube
1466                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1467                 if mobj2 is not None:
1468                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1469                         return
1470
1471                 # At this point we have a new video
1472                 self._downloader.increment_downloads()
1473
1474                 simple_title = mobj.group(2).decode('utf-8')
1475
1476                 # Retrieve video webpage to extract further information
1477                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1478                 try:
1479                         self.report_download_webpage(video_id)
1480                         webpage = urllib2.urlopen(request).read()
1481                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1482                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1483                         return
1484
1485                 # Extract URL, uploader and title from webpage
1486                 self.report_extraction(video_id)
1487                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1488                 if mobj is not None:
1489                         mediaURL = urllib.unquote(mobj.group(1))
1490                         video_extension = mediaURL[-3:]
1491
1492                         # Extract gdaKey if available
1493                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1494                         if mobj is None:
1495                                 video_url = mediaURL
1496                         else:
1497                                 gdaKey = mobj.group(1)
1498                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1499                 else:
1500                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1501                         if mobj is None:
1502                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1503                                 return
1504                         vardict = parse_qs(mobj.group(1))
1505                         if 'mediaData' not in vardict:
1506                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1507                                 return
1508                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1509                         if mobj is None:
1510                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1511                                 return
1512                         mediaURL = mobj.group(1).replace('\\/', '/')
1513                         video_extension = mediaURL[-3:]
1514                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1515
1516                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1517                 if mobj is None:
1518                         self._downloader.trouble(u'ERROR: unable to extract title')
1519                         return
1520                 video_title = mobj.group(1).decode('utf-8')
1521                 video_title = sanitize_title(video_title)
1522
1523                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1524                 if mobj is None:
1525                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1526                         return
1527                 video_uploader = mobj.group(1)
1528
1529                 try:
1530                         # Process video information
1531                         self._downloader.process_info({
1532                                 'id':           video_id.decode('utf-8'),
1533                                 'url':          video_url.decode('utf-8'),
1534                                 'uploader':     video_uploader.decode('utf-8'),
1535                                 'upload_date':  u'NA',
1536                                 'title':        video_title,
1537                                 'stitle':       simple_title,
1538                                 'ext':          video_extension.decode('utf-8'),
1539                                 'format':       u'NA',
1540                                 'player_url':   None,
1541                         })
1542                 except UnavailableVideoError:
1543                         self._downloader.trouble(u'\nERROR: unable to download video')
1544
1545
1546 class DailymotionIE(InfoExtractor):
1547         """Information Extractor for Dailymotion"""
1548
1549         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1550         IE_NAME = u'dailymotion'
1551
1552         def __init__(self, downloader=None):
1553                 InfoExtractor.__init__(self, downloader)
1554
1555         def report_download_webpage(self, video_id):
1556                 """Report webpage download."""
1557                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1558
1559         def report_extraction(self, video_id):
1560                 """Report information extraction."""
1561                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1562
1563         def _real_initialize(self):
1564                 return
1565
1566         def _real_extract(self, url):
1567                 # Extract id and simplified title from URL
1568                 mobj = re.match(self._VALID_URL, url)
1569                 if mobj is None:
1570                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1571                         return
1572
1573                 # At this point we have a new video
1574                 self._downloader.increment_downloads()
1575                 video_id = mobj.group(1)
1576
1577                 simple_title = mobj.group(2).decode('utf-8')
1578                 video_extension = 'flv'
1579
1580                 # Retrieve video webpage to extract further information
1581                 request = urllib2.Request(url)
1582                 request.add_header('Cookie', 'family_filter=off')
1583                 try:
1584                         self.report_download_webpage(video_id)
1585                         webpage = urllib2.urlopen(request).read()
1586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1587                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1588                         return
1589
1590                 # Extract URL, uploader and title from webpage
1591                 self.report_extraction(video_id)
1592                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1593                 if mobj is None:
1594                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1595                         return
1596                 sequence = urllib.unquote(mobj.group(1))
1597                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1600                         return
1601                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1602
1603                 # if needed add http://www.dailymotion.com/ if relative URL
1604
1605                 video_url = mediaURL
1606
1607                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: unable to extract title')
1610                         return
1611                 video_title = mobj.group(1).decode('utf-8')
1612                 video_title = sanitize_title(video_title)
1613
1614                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1615                 if mobj is None:
1616                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1617                         return
1618                 video_uploader = mobj.group(1)
1619
1620                 try:
1621                         # Process video information
1622                         self._downloader.process_info({
1623                                 'id':           video_id.decode('utf-8'),
1624                                 'url':          video_url.decode('utf-8'),
1625                                 'uploader':     video_uploader.decode('utf-8'),
1626                                 'upload_date':  u'NA',
1627                                 'title':        video_title,
1628                                 'stitle':       simple_title,
1629                                 'ext':          video_extension.decode('utf-8'),
1630                                 'format':       u'NA',
1631                                 'player_url':   None,
1632                         })
1633                 except UnavailableVideoError:
1634                         self._downloader.trouble(u'\nERROR: unable to download video')
1635
1636
1637 class GoogleIE(InfoExtractor):
1638         """Information extractor for video.google.com."""
1639
1640         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1641         IE_NAME = u'video.google'
1642
1643         def __init__(self, downloader=None):
1644                 InfoExtractor.__init__(self, downloader)
1645
1646         def report_download_webpage(self, video_id):
1647                 """Report webpage download."""
1648                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1649
1650         def report_extraction(self, video_id):
1651                 """Report information extraction."""
1652                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1653
1654         def _real_initialize(self):
1655                 return
1656
1657         def _real_extract(self, url):
1658                 # Extract id from URL
1659                 mobj = re.match(self._VALID_URL, url)
1660                 if mobj is None:
1661                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1662                         return
1663
1664                 # At this point we have a new video
1665                 self._downloader.increment_downloads()
1666                 video_id = mobj.group(1)
1667
1668                 video_extension = 'mp4'
1669
1670                 # Retrieve video webpage to extract further information
1671                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1672                 try:
1673                         self.report_download_webpage(video_id)
1674                         webpage = urllib2.urlopen(request).read()
1675                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677                         return
1678
1679                 # Extract URL, uploader, and title from webpage
1680                 self.report_extraction(video_id)
1681                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1682                 if mobj is None:
1683                         video_extension = 'flv'
1684                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1685                 if mobj is None:
1686                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1687                         return
1688                 mediaURL = urllib.unquote(mobj.group(1))
1689                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1690                 mediaURL = mediaURL.replace('\\x26', '\x26')
1691
1692                 video_url = mediaURL
1693
1694                 mobj = re.search(r'<title>(.*)</title>', webpage)
1695                 if mobj is None:
1696                         self._downloader.trouble(u'ERROR: unable to extract title')
1697                         return
1698                 video_title = mobj.group(1).decode('utf-8')
1699                 video_title = sanitize_title(video_title)
1700                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1701
1702                 # Extract video description
1703                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1704                 if mobj is None:
1705                         self._downloader.trouble(u'ERROR: unable to extract video description')
1706                         return
1707                 video_description = mobj.group(1).decode('utf-8')
1708                 if not video_description:
1709                         video_description = 'No description available.'
1710
1711                 # Extract video thumbnail
1712                 if self._downloader.params.get('forcethumbnail', False):
1713                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1714                         try:
1715                                 webpage = urllib2.urlopen(request).read()
1716                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1718                                 return
1719                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1720                         if mobj is None:
1721                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1722                                 return
1723                         video_thumbnail = mobj.group(1)
1724                 else:   # we need something to pass to process_info
1725                         video_thumbnail = ''
1726
1727                 try:
1728                         # Process video information
1729                         self._downloader.process_info({
1730                                 'id':           video_id.decode('utf-8'),
1731                                 'url':          video_url.decode('utf-8'),
1732                                 'uploader':     u'NA',
1733                                 'upload_date':  u'NA',
1734                                 'title':        video_title,
1735                                 'stitle':       simple_title,
1736                                 'ext':          video_extension.decode('utf-8'),
1737                                 'format':       u'NA',
1738                                 'player_url':   None,
1739                         })
1740                 except UnavailableVideoError:
1741                         self._downloader.trouble(u'\nERROR: unable to download video')
1742
1743
1744 class PhotobucketIE(InfoExtractor):
1745         """Information extractor for photobucket.com."""
1746
1747         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1748         IE_NAME = u'photobucket'
1749
1750         def __init__(self, downloader=None):
1751                 InfoExtractor.__init__(self, downloader)
1752
1753         def report_download_webpage(self, video_id):
1754                 """Report webpage download."""
1755                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1756
1757         def report_extraction(self, video_id):
1758                 """Report information extraction."""
1759                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1760
1761         def _real_initialize(self):
1762                 return
1763
1764         def _real_extract(self, url):
1765                 # Extract id from URL
1766                 mobj = re.match(self._VALID_URL, url)
1767                 if mobj is None:
1768                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1769                         return
1770
1771                 # At this point we have a new video
1772                 self._downloader.increment_downloads()
1773                 video_id = mobj.group(1)
1774
1775                 video_extension = 'flv'
1776
1777                 # Retrieve video webpage to extract further information
1778                 request = urllib2.Request(url)
1779                 try:
1780                         self.report_download_webpage(video_id)
1781                         webpage = urllib2.urlopen(request).read()
1782                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1784                         return
1785
1786                 # Extract URL, uploader, and title from webpage
1787                 self.report_extraction(video_id)
1788                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1789                 if mobj is None:
1790                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1791                         return
1792                 mediaURL = urllib.unquote(mobj.group(1))
1793
1794                 video_url = mediaURL
1795
1796                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1797                 if mobj is None:
1798                         self._downloader.trouble(u'ERROR: unable to extract title')
1799                         return
1800                 video_title = mobj.group(1).decode('utf-8')
1801                 video_title = sanitize_title(video_title)
1802                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1803
1804                 video_uploader = mobj.group(2).decode('utf-8')
1805
1806                 try:
1807                         # Process video information
1808                         self._downloader.process_info({
1809                                 'id':           video_id.decode('utf-8'),
1810                                 'url':          video_url.decode('utf-8'),
1811                                 'uploader':     video_uploader,
1812                                 'upload_date':  u'NA',
1813                                 'title':        video_title,
1814                                 'stitle':       simple_title,
1815                                 'ext':          video_extension.decode('utf-8'),
1816                                 'format':       u'NA',
1817                                 'player_url':   None,
1818                         })
1819                 except UnavailableVideoError:
1820                         self._downloader.trouble(u'\nERROR: unable to download video')
1821
1822
1823 class YahooIE(InfoExtractor):
1824         """Information extractor for video.yahoo.com."""
1825
1826         # _VALID_URL matches all Yahoo! Video URLs
1827         # _VPAGE_URL matches only the extractable '/watch/' URLs
1828         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1829         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1830         IE_NAME = u'video.yahoo'
1831
1832         def __init__(self, downloader=None):
1833                 InfoExtractor.__init__(self, downloader)
1834
1835         def report_download_webpage(self, video_id):
1836                 """Report webpage download."""
1837                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1838
1839         def report_extraction(self, video_id):
1840                 """Report information extraction."""
1841                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1842
1843         def _real_initialize(self):
1844                 return
1845
1846         def _real_extract(self, url, new_video=True):
1847                 # Extract ID from URL
1848                 mobj = re.match(self._VALID_URL, url)
1849                 if mobj is None:
1850                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1851                         return
1852
1853                 # At this point we have a new video
1854                 self._downloader.increment_downloads()
1855                 video_id = mobj.group(2)
1856                 video_extension = 'flv'
1857
1858                 # Rewrite valid but non-extractable URLs as
1859                 # extractable English language /watch/ URLs
1860                 if re.match(self._VPAGE_URL, url) is None:
1861                         request = urllib2.Request(url)
1862                         try:
1863                                 webpage = urllib2.urlopen(request).read()
1864                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1866                                 return
1867
1868                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1869                         if mobj is None:
1870                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1871                                 return
1872                         yahoo_id = mobj.group(1)
1873
1874                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1875                         if mobj is None:
1876                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1877                                 return
1878                         yahoo_vid = mobj.group(1)
1879
1880                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1881                         return self._real_extract(url, new_video=False)
1882
1883                 # Retrieve video webpage to extract further information
1884                 request = urllib2.Request(url)
1885                 try:
1886                         self.report_download_webpage(video_id)
1887                         webpage = urllib2.urlopen(request).read()
1888                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1890                         return
1891
1892                 # Extract uploader and title from webpage
1893                 self.report_extraction(video_id)
1894                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1895                 if mobj is None:
1896                         self._downloader.trouble(u'ERROR: unable to extract video title')
1897                         return
1898                 video_title = mobj.group(1).decode('utf-8')
1899                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1900
1901                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1904                         return
1905                 video_uploader = mobj.group(1).decode('utf-8')
1906
1907                 # Extract video thumbnail
1908                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1909                 if mobj is None:
1910                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1911                         return
1912                 video_thumbnail = mobj.group(1).decode('utf-8')
1913
1914                 # Extract video description
1915                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1916                 if mobj is None:
1917                         self._downloader.trouble(u'ERROR: unable to extract video description')
1918                         return
1919                 video_description = mobj.group(1).decode('utf-8')
1920                 if not video_description:
1921                         video_description = 'No description available.'
1922
1923                 # Extract video height and width
1924                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1925                 if mobj is None:
1926                         self._downloader.trouble(u'ERROR: unable to extract video height')
1927                         return
1928                 yv_video_height = mobj.group(1)
1929
1930                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1931                 if mobj is None:
1932                         self._downloader.trouble(u'ERROR: unable to extract video width')
1933                         return
1934                 yv_video_width = mobj.group(1)
1935
1936                 # Retrieve video playlist to extract media URL
1937                 # I'm not completely sure what all these options are, but we
1938                 # seem to need most of them, otherwise the server sends a 401.
1939                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1940                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1941                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1942                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1943                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1944                 try:
1945                         self.report_download_webpage(video_id)
1946                         webpage = urllib2.urlopen(request).read()
1947                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1949                         return
1950
1951                 # Extract media URL from playlist XML
1952                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1953                 if mobj is None:
1954                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1955                         return
1956                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1957                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1958
1959                 try:
1960                         # Process video information
1961                         self._downloader.process_info({
1962                                 'id':           video_id.decode('utf-8'),
1963                                 'url':          video_url,
1964                                 'uploader':     video_uploader,
1965                                 'upload_date':  u'NA',
1966                                 'title':        video_title,
1967                                 'stitle':       simple_title,
1968                                 'ext':          video_extension.decode('utf-8'),
1969                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1970                                 'description':  video_description,
1971                                 'thumbnail':    video_thumbnail,
1972                                 'player_url':   None,
1973                         })
1974                 except UnavailableVideoError:
1975                         self._downloader.trouble(u'\nERROR: unable to download video')
1976
1977
1978 class VimeoIE(InfoExtractor):
1979         """Information extractor for vimeo.com."""
1980
1981         # _VALID_URL matches Vimeo URLs
1982         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1983         IE_NAME = u'vimeo'
1984
1985         def __init__(self, downloader=None):
1986                 InfoExtractor.__init__(self, downloader)
1987
1988         def report_download_webpage(self, video_id):
1989                 """Report webpage download."""
1990                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1991
1992         def report_extraction(self, video_id):
1993                 """Report information extraction."""
1994                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1995
1996         def _real_initialize(self):
1997                 return
1998
1999         def _real_extract(self, url, new_video=True):
2000                 # Extract ID from URL
2001                 mobj = re.match(self._VALID_URL, url)
2002                 if mobj is None:
2003                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2004                         return
2005
2006                 # At this point we have a new video
2007                 self._downloader.increment_downloads()
2008                 video_id = mobj.group(1)
2009
2010                 # Retrieve video webpage to extract further information
2011                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2012                 try:
2013                         self.report_download_webpage(video_id)
2014                         webpage = urllib2.urlopen(request).read()
2015                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017                         return
2018
2019                 # Now we begin extracting as much information as we can from what we
2020                 # retrieved. First we extract the information common to all extractors,
2021                 # and latter we extract those that are Vimeo specific.
2022                 self.report_extraction(video_id)
2023
2024                 # Extract title
2025                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2026                 if mobj is None:
2027                         self._downloader.trouble(u'ERROR: unable to extract video title')
2028                         return
2029                 video_title = mobj.group(1).decode('utf-8')
2030                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2031
2032                 # Extract uploader
2033                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2034                 if mobj is None:
2035                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2036                         return
2037                 video_uploader = mobj.group(1).decode('utf-8')
2038
2039                 # Extract video thumbnail
2040                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2041                 if mobj is None:
2042                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2043                         return
2044                 video_thumbnail = mobj.group(1).decode('utf-8')
2045
2046                 # # Extract video description
2047                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2048                 # if mobj is None:
2049                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2050                 #       return
2051                 # video_description = mobj.group(1).decode('utf-8')
2052                 # if not video_description: video_description = 'No description available.'
2053                 video_description = 'Foo.'
2054
2055                 # Vimeo specific: extract request signature
2056                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2057                 if mobj is None:
2058                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2059                         return
2060                 sig = mobj.group(1).decode('utf-8')
2061
2062                 # Vimeo specific: Extract request signature expiration
2063                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2064                 if mobj is None:
2065                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2066                         return
2067                 sig_exp = mobj.group(1).decode('utf-8')
2068
2069                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2070
2071                 try:
2072                         # Process video information
2073                         self._downloader.process_info({
2074                                 'id':           video_id.decode('utf-8'),
2075                                 'url':          video_url,
2076                                 'uploader':     video_uploader,
2077                                 'upload_date':  u'NA',
2078                                 'title':        video_title,
2079                                 'stitle':       simple_title,
2080                                 'ext':          u'mp4',
2081                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2082                                 'description':  video_description,
2083                                 'thumbnail':    video_thumbnail,
2084                                 'description':  video_description,
2085                                 'player_url':   None,
2086                         })
2087                 except UnavailableVideoError:
2088                         self._downloader.trouble(u'ERROR: unable to download video')
2089
2090
2091 class GenericIE(InfoExtractor):
2092         """Generic last-resort information extractor."""
2093
2094         _VALID_URL = r'.*'
2095         IE_NAME = u'generic'
2096
2097         def __init__(self, downloader=None):
2098                 InfoExtractor.__init__(self, downloader)
2099
2100         def report_download_webpage(self, video_id):
2101                 """Report webpage download."""
2102                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2103                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2104
2105         def report_extraction(self, video_id):
2106                 """Report information extraction."""
2107                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2108
2109         def _real_initialize(self):
2110                 return
2111
2112         def _real_extract(self, url):
2113                 # At this point we have a new video
2114                 self._downloader.increment_downloads()
2115
2116                 video_id = url.split('/')[-1]
2117                 request = urllib2.Request(url)
2118                 try:
2119                         self.report_download_webpage(video_id)
2120                         webpage = urllib2.urlopen(request).read()
2121                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2122                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2123                         return
2124                 except ValueError, err:
2125                         # since this is the last-resort InfoExtractor, if
2126                         # this error is thrown, it'll be thrown here
2127                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2128                         return
2129
2130                 self.report_extraction(video_id)
2131                 # Start with something easy: JW Player in SWFObject
2132                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2133                 if mobj is None:
2134                         # Broaden the search a little bit
2135                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2136                 if mobj is None:
2137                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2138                         return
2139
2140                 # It's possible that one of the regexes
2141                 # matched, but returned an empty group:
2142                 if mobj.group(1) is None:
2143                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2144                         return
2145
2146                 video_url = urllib.unquote(mobj.group(1))
2147                 video_id = os.path.basename(video_url)
2148
2149                 # here's a fun little line of code for you:
2150                 video_extension = os.path.splitext(video_id)[1][1:]
2151                 video_id = os.path.splitext(video_id)[0]
2152
2153                 # it's tempting to parse this further, but you would
2154                 # have to take into account all the variations like
2155                 #   Video Title - Site Name
2156                 #   Site Name | Video Title
2157                 #   Video Title - Tagline | Site Name
2158                 # and so on and so forth; it's just not practical
2159                 mobj = re.search(r'<title>(.*)</title>', webpage)
2160                 if mobj is None:
2161                         self._downloader.trouble(u'ERROR: unable to extract title')
2162                         return
2163                 video_title = mobj.group(1).decode('utf-8')
2164                 video_title = sanitize_title(video_title)
2165                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2166
2167                 # video uploader is domain name
2168                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2169                 if mobj is None:
2170                         self._downloader.trouble(u'ERROR: unable to extract title')
2171                         return
2172                 video_uploader = mobj.group(1).decode('utf-8')
2173
2174                 try:
2175                         # Process video information
2176                         self._downloader.process_info({
2177                                 'id':           video_id.decode('utf-8'),
2178                                 'url':          video_url.decode('utf-8'),
2179                                 'uploader':     video_uploader,
2180                                 'upload_date':  u'NA',
2181                                 'title':        video_title,
2182                                 'stitle':       simple_title,
2183                                 'ext':          video_extension.decode('utf-8'),
2184                                 'format':       u'NA',
2185                                 'player_url':   None,
2186                         })
2187                 except UnavailableVideoError, err:
2188                         self._downloader.trouble(u'\nERROR: unable to download video')
2189
2190
2191 class YoutubeSearchIE(InfoExtractor):
2192         """Information Extractor for YouTube search queries."""
2193         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2194         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2195         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2196         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2197         _youtube_ie = None
2198         _max_youtube_results = 1000
2199         IE_NAME = u'youtube:search'
2200
2201         def __init__(self, youtube_ie, downloader=None):
2202                 InfoExtractor.__init__(self, downloader)
2203                 self._youtube_ie = youtube_ie
2204
2205         def report_download_page(self, query, pagenum):
2206                 """Report attempt to download playlist page with given number."""
2207                 query = query.decode(preferredencoding())
2208                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2209
2210         def _real_initialize(self):
2211                 self._youtube_ie.initialize()
2212
2213         def _real_extract(self, query):
2214                 mobj = re.match(self._VALID_URL, query)
2215                 if mobj is None:
2216                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2217                         return
2218
2219                 prefix, query = query.split(':')
2220                 prefix = prefix[8:]
2221                 query = query.encode('utf-8')
2222                 if prefix == '':
2223                         self._download_n_results(query, 1)
2224                         return
2225                 elif prefix == 'all':
2226                         self._download_n_results(query, self._max_youtube_results)
2227                         return
2228                 else:
2229                         try:
2230                                 n = long(prefix)
2231                                 if n <= 0:
2232                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2233                                         return
2234                                 elif n > self._max_youtube_results:
2235                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2236                                         n = self._max_youtube_results
2237                                 self._download_n_results(query, n)
2238                                 return
2239                         except ValueError: # parsing prefix as integer fails
2240                                 self._download_n_results(query, 1)
2241                                 return
2242
2243         def _download_n_results(self, query, n):
2244                 """Downloads a specified number of results for a query"""
2245
2246                 video_ids = []
2247                 already_seen = set()
2248                 pagenum = 1
2249
2250                 while True:
2251                         self.report_download_page(query, pagenum)
2252                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2253                         request = urllib2.Request(result_url)
2254                         try:
2255                                 page = urllib2.urlopen(request).read()
2256                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2257                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2258                                 return
2259
2260                         # Extract video identifiers
2261                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2262                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2263                                 if video_id not in already_seen:
2264                                         video_ids.append(video_id)
2265                                         already_seen.add(video_id)
2266                                         if len(video_ids) == n:
2267                                                 # Specified n videos reached
2268                                                 for id in video_ids:
2269                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2270                                                 return
2271
2272                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2273                                 for id in video_ids:
2274                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2275                                 return
2276
2277                         pagenum = pagenum + 1
2278
2279
2280 class GoogleSearchIE(InfoExtractor):
2281         """Information Extractor for Google Video search queries."""
2282         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2283         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2284         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2285         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2286         _google_ie = None
2287         _max_google_results = 1000
2288         IE_NAME = u'video.google:search'
2289
2290         def __init__(self, google_ie, downloader=None):
2291                 InfoExtractor.__init__(self, downloader)
2292                 self._google_ie = google_ie
2293
2294         def report_download_page(self, query, pagenum):
2295                 """Report attempt to download playlist page with given number."""
2296                 query = query.decode(preferredencoding())
2297                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2298
2299         def _real_initialize(self):
2300                 self._google_ie.initialize()
2301
2302         def _real_extract(self, query):
2303                 mobj = re.match(self._VALID_URL, query)
2304                 if mobj is None:
2305                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2306                         return
2307
2308                 prefix, query = query.split(':')
2309                 prefix = prefix[8:]
2310                 query = query.encode('utf-8')
2311                 if prefix == '':
2312                         self._download_n_results(query, 1)
2313                         return
2314                 elif prefix == 'all':
2315                         self._download_n_results(query, self._max_google_results)
2316                         return
2317                 else:
2318                         try:
2319                                 n = long(prefix)
2320                                 if n <= 0:
2321                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2322                                         return
2323                                 elif n > self._max_google_results:
2324                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2325                                         n = self._max_google_results
2326                                 self._download_n_results(query, n)
2327                                 return
2328                         except ValueError: # parsing prefix as integer fails
2329                                 self._download_n_results(query, 1)
2330                                 return
2331
2332         def _download_n_results(self, query, n):
2333                 """Downloads a specified number of results for a query"""
2334
2335                 video_ids = []
2336                 already_seen = set()
2337                 pagenum = 1
2338
2339                 while True:
2340                         self.report_download_page(query, pagenum)
2341                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2342                         request = urllib2.Request(result_url)
2343                         try:
2344                                 page = urllib2.urlopen(request).read()
2345                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2346                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2347                                 return
2348
2349                         # Extract video identifiers
2350                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2351                                 video_id = mobj.group(1)
2352                                 if video_id not in already_seen:
2353                                         video_ids.append(video_id)
2354                                         already_seen.add(video_id)
2355                                         if len(video_ids) == n:
2356                                                 # Specified n videos reached
2357                                                 for id in video_ids:
2358                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2359                                                 return
2360
2361                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2362                                 for id in video_ids:
2363                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2364                                 return
2365
2366                         pagenum = pagenum + 1
2367
2368
2369 class YahooSearchIE(InfoExtractor):
2370         """Information Extractor for Yahoo! Video search queries."""
2371         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2372         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2373         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2374         _MORE_PAGES_INDICATOR = r'\s*Next'
2375         _yahoo_ie = None
2376         _max_yahoo_results = 1000
2377         IE_NAME = u'video.yahoo:search'
2378
2379         def __init__(self, yahoo_ie, downloader=None):
2380                 InfoExtractor.__init__(self, downloader)
2381                 self._yahoo_ie = yahoo_ie
2382
2383         def report_download_page(self, query, pagenum):
2384                 """Report attempt to download playlist page with given number."""
2385                 query = query.decode(preferredencoding())
2386                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2387
2388         def _real_initialize(self):
2389                 self._yahoo_ie.initialize()
2390
2391         def _real_extract(self, query):
2392                 mobj = re.match(self._VALID_URL, query)
2393                 if mobj is None:
2394                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2395                         return
2396
2397                 prefix, query = query.split(':')
2398                 prefix = prefix[8:]
2399                 query = query.encode('utf-8')
2400                 if prefix == '':
2401                         self._download_n_results(query, 1)
2402                         return
2403                 elif prefix == 'all':
2404                         self._download_n_results(query, self._max_yahoo_results)
2405                         return
2406                 else:
2407                         try:
2408                                 n = long(prefix)
2409                                 if n <= 0:
2410                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2411                                         return
2412                                 elif n > self._max_yahoo_results:
2413                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2414                                         n = self._max_yahoo_results
2415                                 self._download_n_results(query, n)
2416                                 return
2417                         except ValueError: # parsing prefix as integer fails
2418                                 self._download_n_results(query, 1)
2419                                 return
2420
2421         def _download_n_results(self, query, n):
2422                 """Downloads a specified number of results for a query"""
2423
2424                 video_ids = []
2425                 already_seen = set()
2426                 pagenum = 1
2427
2428                 while True:
2429                         self.report_download_page(query, pagenum)
2430                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2431                         request = urllib2.Request(result_url)
2432                         try:
2433                                 page = urllib2.urlopen(request).read()
2434                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2435                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2436                                 return
2437
2438                         # Extract video identifiers
2439                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2440                                 video_id = mobj.group(1)
2441                                 if video_id not in already_seen:
2442                                         video_ids.append(video_id)
2443                                         already_seen.add(video_id)
2444                                         if len(video_ids) == n:
2445                                                 # Specified n videos reached
2446                                                 for id in video_ids:
2447                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2448                                                 return
2449
2450                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2451                                 for id in video_ids:
2452                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2453                                 return
2454
2455                         pagenum = pagenum + 1
2456
2457
2458 class YoutubePlaylistIE(InfoExtractor):
2459         """Information Extractor for YouTube playlists."""
2460
2461         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2462         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2463         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2464         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2465         _youtube_ie = None
2466         IE_NAME = u'youtube:playlist'
2467
2468         def __init__(self, youtube_ie, downloader=None):
2469                 InfoExtractor.__init__(self, downloader)
2470                 self._youtube_ie = youtube_ie
2471
2472         def report_download_page(self, playlist_id, pagenum):
2473                 """Report attempt to download playlist page with given number."""
2474                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2475
2476         def _real_initialize(self):
2477                 self._youtube_ie.initialize()
2478
2479         def _real_extract(self, url):
2480                 # Extract playlist id
2481                 mobj = re.match(self._VALID_URL, url)
2482                 if mobj is None:
2483                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2484                         return
2485
2486                 # Single video case
2487                 if mobj.group(3) is not None:
2488                         self._youtube_ie.extract(mobj.group(3))
2489                         return
2490
2491                 # Download playlist pages
2492                 # prefix is 'p' as default for playlists but there are other types that need extra care
2493                 playlist_prefix = mobj.group(1)
2494                 if playlist_prefix == 'a':
2495                         playlist_access = 'artist'
2496                 else:
2497                         playlist_prefix = 'p'
2498                         playlist_access = 'view_play_list'
2499                 playlist_id = mobj.group(2)
2500                 video_ids = []
2501                 pagenum = 1
2502
2503                 while True:
2504                         self.report_download_page(playlist_id, pagenum)
2505                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2506                         try:
2507                                 page = urllib2.urlopen(request).read()
2508                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2509                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2510                                 return
2511
2512                         # Extract video identifiers
2513                         ids_in_page = []
2514                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2515                                 if mobj.group(1) not in ids_in_page:
2516                                         ids_in_page.append(mobj.group(1))
2517                         video_ids.extend(ids_in_page)
2518
2519                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2520                                 break
2521                         pagenum = pagenum + 1
2522
2523                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2524                 playlistend = self._downloader.params.get('playlistend', -1)
2525                 video_ids = video_ids[playliststart:playlistend]
2526
2527                 for id in video_ids:
2528                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2529                 return
2530
2531
2532 class YoutubeUserIE(InfoExtractor):
2533         """Information Extractor for YouTube users."""
2534
2535         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2536         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2537         _GDATA_PAGE_SIZE = 50
2538         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2539         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2540         _youtube_ie = None
2541         IE_NAME = u'youtube:user'
2542
2543         def __init__(self, youtube_ie, downloader=None):
2544                 InfoExtractor.__init__(self, downloader)
2545                 self._youtube_ie = youtube_ie
2546
2547         def report_download_page(self, username, start_index):
2548                 """Report attempt to download user page."""
2549                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2550                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2551
2552         def _real_initialize(self):
2553                 self._youtube_ie.initialize()
2554
2555         def _real_extract(self, url):
2556                 # Extract username
2557                 mobj = re.match(self._VALID_URL, url)
2558                 if mobj is None:
2559                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2560                         return
2561
2562                 username = mobj.group(1)
2563
2564                 # Download video ids using YouTube Data API. Result size per
2565                 # query is limited (currently to 50 videos) so we need to query
2566                 # page by page until there are no video ids - it means we got
2567                 # all of them.
2568
2569                 video_ids = []
2570                 pagenum = 0
2571
2572                 while True:
2573                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2574                         self.report_download_page(username, start_index)
2575
2576                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2577
2578                         try:
2579                                 page = urllib2.urlopen(request).read()
2580                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2581                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2582                                 return
2583
2584                         # Extract video identifiers
2585                         ids_in_page = []
2586
2587                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2588                                 if mobj.group(1) not in ids_in_page:
2589                                         ids_in_page.append(mobj.group(1))
2590
2591                         video_ids.extend(ids_in_page)
2592
2593                         # A little optimization - if current page is not
2594                         # "full", ie. does not contain PAGE_SIZE video ids then
2595                         # we can assume that this page is the last one - there
2596                         # are no more ids on further pages - no need to query
2597                         # again.
2598
2599                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2600                                 break
2601
2602                         pagenum += 1
2603
2604                 all_ids_count = len(video_ids)
2605                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2606                 playlistend = self._downloader.params.get('playlistend', -1)
2607
2608                 if playlistend == -1:
2609                         video_ids = video_ids[playliststart:]
2610                 else:
2611                         video_ids = video_ids[playliststart:playlistend]
2612
2613                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2614                                 (username, all_ids_count, len(video_ids)))
2615
2616                 for video_id in video_ids:
2617                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2618
2619
2620 class DepositFilesIE(InfoExtractor):
2621         """Information extractor for depositfiles.com"""
2622
2623         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2624         IE_NAME = u'DepositFiles'
2625
2626         def __init__(self, downloader=None):
2627                 InfoExtractor.__init__(self, downloader)
2628
2629         def report_download_webpage(self, file_id):
2630                 """Report webpage download."""
2631                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2632
2633         def report_extraction(self, file_id):
2634                 """Report information extraction."""
2635                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2636
2637         def _real_initialize(self):
2638                 return
2639
2640         def _real_extract(self, url):
2641                 # At this point we have a new file
2642                 self._downloader.increment_downloads()
2643
2644                 file_id = url.split('/')[-1]
2645                 # Rebuild url in english locale
2646                 url = 'http://depositfiles.com/en/files/' + file_id
2647
2648                 # Retrieve file webpage with 'Free download' button pressed
2649                 free_download_indication = { 'gateway_result' : '1' }
2650                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2651                 try:
2652                         self.report_download_webpage(file_id)
2653                         webpage = urllib2.urlopen(request).read()
2654                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2655                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2656                         return
2657
2658                 # Search for the real file URL
2659                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2660                 if (mobj is None) or (mobj.group(1) is None):
2661                         # Try to figure out reason of the error.
2662                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2663                         if (mobj is not None) and (mobj.group(1) is not None):
2664                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2665                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2666                         else:
2667                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2668                         return
2669
2670                 file_url = mobj.group(1)
2671                 file_extension = os.path.splitext(file_url)[1][1:]
2672
2673                 # Search for file title
2674                 mobj = re.search(r'<b title="(.*?)">', webpage)
2675                 if mobj is None:
2676                         self._downloader.trouble(u'ERROR: unable to extract title')
2677                         return
2678                 file_title = mobj.group(1).decode('utf-8')
2679
2680                 try:
2681                         # Process file information
2682                         self._downloader.process_info({
2683                                 'id':           file_id.decode('utf-8'),
2684                                 'url':          file_url.decode('utf-8'),
2685                                 'uploader':     u'NA',
2686                                 'upload_date':  u'NA',
2687                                 'title':        file_title,
2688                                 'stitle':       file_title,
2689                                 'ext':          file_extension.decode('utf-8'),
2690                                 'format':       u'NA',
2691                                 'player_url':   None,
2692                         })
2693                 except UnavailableVideoError, err:
2694                         self._downloader.trouble(u'ERROR: unable to download file')
2695
2696
2697 class FacebookIE(InfoExtractor):
2698         """Information Extractor for Facebook"""
2699
2700         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2701         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2702         _NETRC_MACHINE = 'facebook'
2703         _available_formats = ['highqual', 'lowqual']
2704         _video_extensions = {
2705                 'highqual': 'mp4',
2706                 'lowqual': 'mp4',
2707         }
2708         IE_NAME = u'facebook'
2709
2710         def __init__(self, downloader=None):
2711                 InfoExtractor.__init__(self, downloader)
2712
2713         def _reporter(self, message):
2714                 """Add header and report message."""
2715                 self._downloader.to_screen(u'[facebook] %s' % message)
2716
2717         def report_login(self):
2718                 """Report attempt to log in."""
2719                 self._reporter(u'Logging in')
2720
2721         def report_video_webpage_download(self, video_id):
2722                 """Report attempt to download video webpage."""
2723                 self._reporter(u'%s: Downloading video webpage' % video_id)
2724
2725         def report_information_extraction(self, video_id):
2726                 """Report attempt to extract video information."""
2727                 self._reporter(u'%s: Extracting video information' % video_id)
2728
2729         def _parse_page(self, video_webpage):
2730                 """Extract video information from page"""
2731                 # General data
2732                 data = {'title': r'class="video_title datawrap">(.*?)</',
2733                         'description': r'<div class="datawrap">(.*?)</div>',
2734                         'owner': r'\("video_owner_name", "(.*?)"\)',
2735                         'upload_date': r'data-date="(.*?)"',
2736                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2737                         }
2738                 video_info = {}
2739                 for piece in data.keys():
2740                         mobj = re.search(data[piece], video_webpage)
2741                         if mobj is not None:
2742                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2743
2744                 # Video urls
2745                 video_urls = {}
2746                 for fmt in self._available_formats:
2747                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2748                         if mobj is not None:
2749                                 # URL is in a Javascript segment inside an escaped Unicode format within
2750                                 # the generally utf-8 page
2751                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2752                 video_info['video_urls'] = video_urls
2753
2754                 return video_info
2755
2756         def _real_initialize(self):
2757                 if self._downloader is None:
2758                         return
2759
2760                 useremail = None
2761                 password = None
2762                 downloader_params = self._downloader.params
2763
2764                 # Attempt to use provided username and password or .netrc data
2765                 if downloader_params.get('username', None) is not None:
2766                         useremail = downloader_params['username']
2767                         password = downloader_params['password']
2768                 elif downloader_params.get('usenetrc', False):
2769                         try:
2770                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2771                                 if info is not None:
2772                                         useremail = info[0]
2773                                         password = info[2]
2774                                 else:
2775                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2776                         except (IOError, netrc.NetrcParseError), err:
2777                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2778                                 return
2779
2780                 if useremail is None:
2781                         return
2782
2783                 # Log in
2784                 login_form = {
2785                         'email': useremail,
2786                         'pass': password,
2787                         'login': 'Log+In'
2788                         }
2789                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2790                 try:
2791                         self.report_login()
2792                         login_results = urllib2.urlopen(request).read()
2793                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2794                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2795                                 return
2796                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2797                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2798                         return
2799
2800         def _real_extract(self, url):
2801                 mobj = re.match(self._VALID_URL, url)
2802                 if mobj is None:
2803                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2804                         return
2805                 video_id = mobj.group('ID')
2806
2807                 # Get video webpage
2808                 self.report_video_webpage_download(video_id)
2809                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2810                 try:
2811                         page = urllib2.urlopen(request)
2812                         video_webpage = page.read()
2813                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2815                         return
2816
2817                 # Start extracting information
2818                 self.report_information_extraction(video_id)
2819
2820                 # Extract information
2821                 video_info = self._parse_page(video_webpage)
2822
2823                 # uploader
2824                 if 'owner' not in video_info:
2825                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2826                         return
2827                 video_uploader = video_info['owner']
2828
2829                 # title
2830                 if 'title' not in video_info:
2831                         self._downloader.trouble(u'ERROR: unable to extract video title')
2832                         return
2833                 video_title = video_info['title']
2834                 video_title = video_title.decode('utf-8')
2835                 video_title = sanitize_title(video_title)
2836
2837                 # simplified title
2838                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2839                 simple_title = simple_title.strip(ur'_')
2840
2841                 # thumbnail image
2842                 if 'thumbnail' not in video_info:
2843                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2844                         video_thumbnail = ''
2845                 else:
2846                         video_thumbnail = video_info['thumbnail']
2847
2848                 # upload date
2849                 upload_date = u'NA'
2850                 if 'upload_date' in video_info:
2851                         upload_time = video_info['upload_date']
2852                         timetuple = email.utils.parsedate_tz(upload_time)
2853                         if timetuple is not None:
2854                                 try:
2855                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2856                                 except:
2857                                         pass
2858
2859                 # description
2860                 video_description = video_info.get('description', 'No description available.')
2861
2862                 url_map = video_info['video_urls']
2863                 if len(url_map.keys()) > 0:
2864                         # Decide which formats to download
2865                         req_format = self._downloader.params.get('format', None)
2866                         format_limit = self._downloader.params.get('format_limit', None)
2867
2868                         if format_limit is not None and format_limit in self._available_formats:
2869                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2870                         else:
2871                                 format_list = self._available_formats
2872                         existing_formats = [x for x in format_list if x in url_map]
2873                         if len(existing_formats) == 0:
2874                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2875                                 return
2876                         if req_format is None:
2877                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2878                         elif req_format == 'worst':
2879                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2880                         elif req_format == '-1':
2881                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2882                         else:
2883                                 # Specific format
2884                                 if req_format not in url_map:
2885                                         self._downloader.trouble(u'ERROR: requested format not available')
2886                                         return
2887                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2888
2889                 for format_param, video_real_url in video_url_list:
2890
2891                         # At this point we have a new video
2892                         self._downloader.increment_downloads()
2893
2894                         # Extension
2895                         video_extension = self._video_extensions.get(format_param, 'mp4')
2896
2897                         try:
2898                                 # Process video information
2899                                 self._downloader.process_info({
2900                                         'id':           video_id.decode('utf-8'),
2901                                         'url':          video_real_url.decode('utf-8'),
2902                                         'uploader':     video_uploader.decode('utf-8'),
2903                                         'upload_date':  upload_date,
2904                                         'title':        video_title,
2905                                         'stitle':       simple_title,
2906                                         'ext':          video_extension.decode('utf-8'),
2907                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2908                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2909                                         'description':  video_description.decode('utf-8'),
2910                                         'player_url':   None,
2911                                 })
2912                         except UnavailableVideoError, err:
2913                                 self._downloader.trouble(u'\nERROR: unable to download video')
2914
2915 class BlipTVIE(InfoExtractor):
2916         """Information extractor for blip.tv"""
2917
2918         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2919         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2920         IE_NAME = u'blip.tv'
2921
2922         def report_extraction(self, file_id):
2923                 """Report information extraction."""
2924                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2925
2926         def report_direct_download(self, title):
2927                 """Report information extraction."""
2928                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2929
2930         def _simplify_title(self, title):
2931                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2932                 res = res.strip(ur'_')
2933                 return res
2934
2935         def _real_extract(self, url):
2936                 mobj = re.match(self._VALID_URL, url)
2937                 if mobj is None:
2938                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2939                         return
2940
2941                 if '?' in url:
2942                         cchar = '&'
2943                 else:
2944                         cchar = '?'
2945                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2946                 request = urllib2.Request(json_url)
2947                 self.report_extraction(mobj.group(1))
2948                 info = None
2949                 try:
2950                         urlh = urllib2.urlopen(request)
2951                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2952                                 basename = url.split('/')[-1]
2953                                 title,ext = os.path.splitext(basename)
2954                                 ext = ext.replace('.', '')
2955                                 self.report_direct_download(title)
2956                                 info = {
2957                                         'id': title,
2958                                         'url': url,
2959                                         'title': title,
2960                                         'stitle': self._simplify_title(title),
2961                                         'ext': ext,
2962                                         'urlhandle': urlh
2963                                 }
2964                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2965                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2966                         return
2967                 if info is None: # Regular URL
2968                         try:
2969                                 json_code = urlh.read()
2970                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2971                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2972                                 return
2973
2974                         try:
2975                                 json_data = json.loads(json_code)
2976                                 if 'Post' in json_data:
2977                                         data = json_data['Post']
2978                                 else:
2979                                         data = json_data
2980         
2981                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2982                                 video_url = data['media']['url']
2983                                 umobj = re.match(self._URL_EXT, video_url)
2984                                 if umobj is None:
2985                                         raise ValueError('Can not determine filename extension')
2986                                 ext = umobj.group(1)
2987         
2988                                 info = {
2989                                         'id': data['item_id'],
2990                                         'url': video_url,
2991                                         'uploader': data['display_name'],
2992                                         'upload_date': upload_date,
2993                                         'title': data['title'],
2994                                         'stitle': self._simplify_title(data['title']),
2995                                         'ext': ext,
2996                                         'format': data['media']['mimeType'],
2997                                         'thumbnail': data['thumbnailUrl'],
2998                                         'description': data['description'],
2999                                         'player_url': data['embedUrl']
3000                                 }
3001                         except (ValueError,KeyError), err:
3002                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3003                                 return
3004
3005                 self._downloader.increment_downloads()
3006
3007                 try:
3008                         self._downloader.process_info(info)
3009                 except UnavailableVideoError, err:
3010                         self._downloader.trouble(u'\nERROR: unable to download video')
3011
3012
3013 class MyVideoIE(InfoExtractor):
3014         """Information Extractor for myvideo.de."""
3015
3016         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3017         IE_NAME = u'myvideo'
3018
3019         def __init__(self, downloader=None):
3020                 InfoExtractor.__init__(self, downloader)
3021         
3022         def report_download_webpage(self, video_id):
3023                 """Report webpage download."""
3024                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3025
3026         def report_extraction(self, video_id):
3027                 """Report information extraction."""
3028                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3029
3030         def _real_initialize(self):
3031                 return
3032
3033         def _real_extract(self,url):
3034                 mobj = re.match(self._VALID_URL, url)
3035                 if mobj is None:
3036                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3037                         return
3038
3039                 video_id = mobj.group(1)
3040                 simple_title = mobj.group(2).decode('utf-8')
3041                 # should actually not be necessary
3042                 simple_title = sanitize_title(simple_title)
3043                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3044
3045                 # Get video webpage
3046                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3047                 try:
3048                         self.report_download_webpage(video_id)
3049                         webpage = urllib2.urlopen(request).read()
3050                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3051                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3052                         return
3053
3054                 self.report_extraction(video_id)
3055                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3056                                  webpage)
3057                 if mobj is None:
3058                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3059                         return
3060                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3061
3062                 mobj = re.search('<title>([^<]+)</title>', webpage)
3063                 if mobj is None:
3064                         self._downloader.trouble(u'ERROR: unable to extract title')
3065                         return
3066
3067                 video_title = mobj.group(1)
3068                 video_title = sanitize_title(video_title)
3069
3070                 try:
3071                         self._downloader.process_info({
3072                                 'id':           video_id,
3073                                 'url':          video_url,
3074                                 'uploader':     u'NA',
3075                                 'upload_date':  u'NA',
3076                                 'title':        video_title,
3077                                 'stitle':       simple_title,
3078                                 'ext':          u'flv',
3079                                 'format':       u'NA',
3080                                 'player_url':   None,
3081                         })
3082                 except UnavailableVideoError:
3083                         self._downloader.trouble(u'\nERROR: Unable to download video')
3084
3085 class ComedyCentralIE(InfoExtractor):
3086         """Information extractor for The Daily Show and Colbert Report """
3087
3088         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3089         IE_NAME = u'comedycentral'
3090
3091         def report_extraction(self, episode_id):
3092                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3093         
3094         def report_config_download(self, episode_id):
3095                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3096
3097         def report_index_download(self, episode_id):
3098                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3099
3100         def report_player_url(self, episode_id):
3101                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3102
3103         def _simplify_title(self, title):
3104                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3105                 res = res.strip(ur'_')
3106                 return res
3107
3108         def _real_extract(self, url):
3109                 mobj = re.match(self._VALID_URL, url)
3110                 if mobj is None:
3111                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3112                         return
3113
3114                 if mobj.group('shortname'):
3115                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3116                                 url = 'http://www.thedailyshow.com/full-episodes/'
3117                         else:
3118                                 url = 'http://www.colbertnation.com/full-episodes/'
3119                         mobj = re.match(self._VALID_URL, url)
3120                         assert mobj is not None
3121
3122                 dlNewest = not mobj.group('episode')
3123                 if dlNewest:
3124                         epTitle = mobj.group('showname')
3125                 else:
3126                         epTitle = mobj.group('episode')
3127
3128                 req = urllib2.Request(url)
3129                 self.report_extraction(epTitle)
3130                 try:
3131                         htmlHandle = urllib2.urlopen(req)
3132                         html = htmlHandle.read()
3133                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3135                         return
3136                 if dlNewest:
3137                         url = htmlHandle.geturl()
3138                         mobj = re.match(self._VALID_URL, url)
3139                         if mobj is None:
3140                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3141                                 return
3142                         if mobj.group('episode') == '':
3143                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3144                                 return
3145                         epTitle = mobj.group('episode')
3146
3147                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3148                 if len(mMovieParams) == 0:
3149                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3150                         return
3151
3152                 playerUrl_raw = mMovieParams[0][0]
3153                 self.report_player_url(epTitle)
3154                 try:
3155                         urlHandle = urllib2.urlopen(playerUrl_raw)
3156                         playerUrl = urlHandle.geturl()
3157                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3158                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3159                         return
3160
3161                 uri = mMovieParams[0][1]
3162                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3163                 self.report_index_download(epTitle)
3164                 try:
3165                         indexXml = urllib2.urlopen(indexUrl).read()
3166                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3167                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3168                         return
3169
3170                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3171                 itemEls = idoc.findall('.//item')
3172                 for itemEl in itemEls:
3173                         mediaId = itemEl.findall('./guid')[0].text
3174                         shortMediaId = mediaId.split(':')[-1]
3175                         showId = mediaId.split(':')[-2].replace('.com', '')
3176                         officialTitle = itemEl.findall('./title')[0].text
3177                         officialDate = itemEl.findall('./pubDate')[0].text
3178
3179                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3180                                                 urllib.urlencode({'uri': mediaId}))
3181                         configReq = urllib2.Request(configUrl)
3182                         self.report_config_download(epTitle)
3183                         try:
3184                                 configXml = urllib2.urlopen(configReq).read()
3185                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3186                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3187                                 return
3188
3189                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3190                         turls = []
3191                         for rendition in cdoc.findall('.//rendition'):
3192                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3193                                 turls.append(finfo)
3194
3195                         if len(turls) == 0:
3196                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3197                                 continue
3198
3199                         # For now, just pick the highest bitrate
3200                         format,video_url = turls[-1]
3201
3202                         self._downloader.increment_downloads()
3203
3204                         effTitle = showId + '-' + epTitle
3205                         info = {
3206                                 'id': shortMediaId,
3207                                 'url': video_url,
3208                                 'uploader': showId,
3209                                 'upload_date': officialDate,
3210                                 'title': effTitle,
3211                                 'stitle': self._simplify_title(effTitle),
3212                                 'ext': 'mp4',
3213                                 'format': format,
3214                                 'thumbnail': None,
3215                                 'description': officialTitle,
3216                                 'player_url': playerUrl
3217                         }
3218
3219                         try:
3220                                 self._downloader.process_info(info)
3221                         except UnavailableVideoError, err:
3222                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3223                                 continue
3224
3225
3226 class EscapistIE(InfoExtractor):
3227         """Information extractor for The Escapist """
3228
3229         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3230         IE_NAME = u'escapist'
3231
3232         def report_extraction(self, showName):
3233                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3234
3235         def report_config_download(self, showName):
3236                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3237
3238         def _simplify_title(self, title):
3239                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3240                 res = res.strip(ur'_')
3241                 return res
3242
3243         def _real_extract(self, url):
3244                 htmlParser = HTMLParser.HTMLParser()
3245
3246                 mobj = re.match(self._VALID_URL, url)
3247                 if mobj is None:
3248                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3249                         return
3250                 showName = mobj.group('showname')
3251                 videoId = mobj.group('episode')
3252
3253                 self.report_extraction(showName)
3254                 try:
3255                         webPage = urllib2.urlopen(url).read()
3256                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3257                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3258                         return
3259
3260                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3261                 description = htmlParser.unescape(descMatch.group(1))
3262                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3263                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3264                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3265                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3266                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3267                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3268
3269                 self.report_config_download(showName)
3270                 try:
3271                         configJSON = urllib2.urlopen(configUrl).read()
3272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3273                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3274                         return
3275
3276                 # Technically, it's JavaScript, not JSON
3277                 configJSON = configJSON.replace("'", '"')
3278
3279                 try:
3280                         config = json.loads(configJSON)
3281                 except (ValueError,), err:
3282                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3283                         return
3284
3285                 playlist = config['playlist']
3286                 videoUrl = playlist[1]['url']
3287
3288                 self._downloader.increment_downloads()
3289                 info = {
3290                         'id': videoId,
3291                         'url': videoUrl,
3292                         'uploader': showName,
3293                         'upload_date': None,
3294                         'title': showName,
3295                         'stitle': self._simplify_title(showName),
3296                         'ext': 'flv',
3297                         'format': 'flv',
3298                         'thumbnail': imgUrl,
3299                         'description': description,
3300                         'player_url': playerUrl,
3301                 }
3302
3303                 try:
3304                         self._downloader.process_info(info)
3305                 except UnavailableVideoError, err:
3306                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3307
3308
3309 class CollegeHumorIE(InfoExtractor):
3310         """Information extractor for collegehumor.com"""
3311
3312         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3313         IE_NAME = u'collegehumor'
3314
3315         def report_webpage(self, video_id):
3316                 """Report information extraction."""
3317                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3318
3319         def report_extraction(self, video_id):
3320                 """Report information extraction."""
3321                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3322
3323         def _simplify_title(self, title):
3324                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3325                 res = res.strip(ur'_')
3326                 return res
3327
3328         def _real_extract(self, url):
3329                 htmlParser = HTMLParser.HTMLParser()
3330
3331                 mobj = re.match(self._VALID_URL, url)
3332                 if mobj is None:
3333                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3334                         return
3335                 video_id = mobj.group('videoid')
3336
3337                 self.report_webpage(video_id)
3338                 request = urllib2.Request(url)
3339                 try:
3340                         webpage = urllib2.urlopen(request).read()
3341                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3342                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3343                         return
3344
3345                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3346                 if m is None:
3347                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3348                         return
3349                 internal_video_id = m.group('internalvideoid')
3350
3351                 info = {
3352                         'id': video_id,
3353                         'internal_id': internal_video_id,
3354                 }
3355
3356                 self.report_extraction(video_id)
3357                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3358                 try:
3359                         metaXml = urllib2.urlopen(xmlUrl).read()
3360                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3361                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3362                         return
3363
3364                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3365                 try:
3366                         videoNode = mdoc.findall('./video')[0]
3367                         info['description'] = videoNode.findall('./description')[0].text
3368                         info['title'] = videoNode.findall('./caption')[0].text
3369                         info['stitle'] = self._simplify_title(info['title'])
3370                         info['url'] = videoNode.findall('./file')[0].text
3371                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3372                         info['ext'] = info['url'].rpartition('.')[2]
3373                         info['format'] = info['ext']
3374                 except IndexError:
3375                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3376                         return
3377
3378                 self._downloader.increment_downloads()
3379
3380                 try:
3381                         self._downloader.process_info(info)
3382                 except UnavailableVideoError, err:
3383                         self._downloader.trouble(u'\nERROR: unable to download video')
3384
3385
3386 class XVideosIE(InfoExtractor):
3387         """Information extractor for xvideos.com"""
3388
3389         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3390         IE_NAME = u'xvideos'
3391
3392         def report_webpage(self, video_id):
3393                 """Report information extraction."""
3394                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3395
3396         def report_extraction(self, video_id):
3397                 """Report information extraction."""
3398                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3399
3400         def _simplify_title(self, title):
3401                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3402                 res = res.strip(ur'_')
3403                 return res
3404
3405         def _real_extract(self, url):
3406                 htmlParser = HTMLParser.HTMLParser()
3407
3408                 mobj = re.match(self._VALID_URL, url)
3409                 if mobj is None:
3410                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3411                         return
3412                 video_id = mobj.group(1).decode('utf-8')
3413
3414                 self.report_webpage(video_id)
3415
3416                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3417                 try:
3418                         webpage = urllib2.urlopen(request).read()
3419                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3420                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3421                         return
3422
3423                 self.report_extraction(video_id)
3424
3425
3426                 # Extract video URL
3427                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3428                 if mobj is None:
3429                         self._downloader.trouble(u'ERROR: unable to extract video title')
3430                         return
3431                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3432
3433
3434                 # Extract title
3435                 mobj = re.search(r'<title>(.*?)</title>', webpage)
3436                 if mobj is None:
3437                         self._downloader.trouble(u'ERROR: unable to extract video title')
3438                         return
3439                 video_title = mobj.group(1).decode('utf-8')
3440
3441
3442                 # Extract video thumbnail
3443                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3444                 if mobj is None:
3445                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3446                         return
3447                 video_thumbnail = mobj.group(1).decode('utf-8')
3448
3449
3450
3451                 self._downloader.increment_downloads()
3452                 info = {
3453                         'id': video_id,
3454                         'url': video_url,
3455                         'uploader': None,
3456                         'upload_date': None,
3457                         'title': video_title,
3458                         'stitle': self._simplify_title(video_title),
3459                         'ext': 'flv',
3460                         'format': 'flv',
3461                         'thumbnail': video_thumbnail,
3462                         'description': None,
3463                         'player_url': None,
3464                 }
3465
3466                 try:
3467                         self._downloader.process_info(info)
3468                 except UnavailableVideoError, err:
3469                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3470
3471
class PostProcessor(object):
        """Base class for steps that run after a successful download.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. Once a download has
        succeeded, run() is called on each object in turn: the first gets
        the initial information and every later one gets whatever the
        previous run() returned. The chain ends when a run() returns None
        or when there are no more objects left.

        As with InfoExtractor objects, registration is mutual: the
        PostProcessor also holds a reference to its downloader.
        """

        # Downloader this object is attached to (None while unattached).
        _downloader = None

        def __init__(self, downloader=None):
                """Create the post processor, optionally attached to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach this post processor to the given downloader."""
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary like those built by the
                InfoExtractors, with one additional "filepath" entry that
                points to the downloaded file.

                Return None to stop the postprocessing chain, or an
                information dictionary (possibly the received one with some
                fields changed) to be passed to the next object in the chain.

                A PostProcessingError may be raised; the calling downloader
                takes it into account.
                """
                # The base implementation passes the information through untouched.
                return information
3517
3518
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that extracts the audio track of a downloaded file.

        ffprobe is used to detect the file's audio codec and ffmpeg to write
        the audio-only file next to the original.
        """

        def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
                """Configure the extraction.

                preferredcodec   -- target codec name; None is treated as 'best'
                preferredquality -- value passed to ffmpeg's -ab option when
                                    re-encoding, or None to leave it unset
                keepvideo        -- keep the downloaded file after extraction
                """
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec
                self._preferredquality = preferredquality
                self._keepvideo = keepvideo

        @staticmethod
        def get_audio_codec(path):
                """Return the codec name of the first audio stream, as reported by ffprobe.

                None is returned when ffprobe cannot be run, exits with a
                non-zero status, or reports no audio stream.
                """
                try:
                        # ffprobe's stderr is discarded; the handle is closed
                        # explicitly so it is not leaked on every call.
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffprobe', '-show_streams', '--', path]
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                                returncode = handle.wait()
                        finally:
                                devnull.close()
                except (IOError, OSError):
                        return None
                if returncode != 0:
                        return None

                # Remember the most recent codec_name and return it as soon as
                # a following line identifies the stream as audio.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is passed to -acodec and more_opts is a list of extra
                command line options. Returns True when ffmpeg exits with
                status 0 and False otherwise, including when it cannot be run.
                """
                try:
                        # All ffmpeg output is discarded; close the handle when done.
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                        finally:
                                devnull.close()
                        return (ret == 0)
                except (IOError, OSError):
                        return False

        def run(self, information):
                """Extract the audio of information['filepath'].

                Returns the information dictionary with 'filepath' pointing to
                the new audio file, or None (stopping the postprocessing
                chain) when the codec cannot be determined, ffmpeg fails, or
                the downloaded file cannot be removed.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec in ['aac', 'mp3', 'vorbis']:
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                                if filecodec == 'vorbis':
                                        extension = 'ogg'
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                if self._preferredquality is not None:
                                        more_opts += ['-ab', self._preferredquality]
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
                        extension = self._preferredcodec
                        if self._preferredquality is not None:
                                more_opts += ['-ab', self._preferredquality]
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']
                        if self._preferredcodec == 'vorbis':
                                extension = 'ogg'

                (prefix, ext) = os.path.splitext(path)
                new_path = prefix + '.' + extension
                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                # Try to update the date time for extracted audio file:
                # access time becomes "now", modification time the recorded
                # 'filetime'.
                if information.get('filetime') is not None:
                        try:
                                os.utime(new_path, (time.time(), information['filetime']))
                        except Exception:
                                # Best effort only, but do not swallow
                                # KeyboardInterrupt/SystemExit as a bare
                                # "except:" would.
                                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

                if not self._keepvideo:
                        try:
                                os.remove(path)
                        except (IOError, OSError):
                                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                                return None

                information['filepath'] = new_path
                return information
3618
3619
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        downloader is only used to report progress through to_screen().
        filename is the path of the program file to overwrite.
        Exits the program with an error message if the file is not writable,
        the download fails or the file cannot be overwritten. Returns without
        touching the file if the downloaded version equals the running one.
        '''
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest version...')

        try:
                # Open before entering try/finally: if urlopen() itself fails there is
                # no handle to close, and closing an unbound name would raise an
                # UnboundLocalError that hides the real error.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
                downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
                return

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3652
def parseOpts():
        ''' Build the command line parser and parse the program arguments

        Returns a (parser, opts, args) tuple: the optparse parser itself (so the
        caller can report errors through it), the parsed option values and the
        remaining positional arguments.
        '''
        # Deferred imports
        import optparse

        def _format_option_string(option):
                ''' ('-o', '--option') -> -o, --option METAVAR'''

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                ''' Return the terminal width in columns, or None if it is unknown '''
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        try:
                                return int(columns)
                        except ValueError:
                                pass # Malformed COLUMNS value: fall back to asking stty

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out,err = sp.communicate()
                        return int(out.split()[1])
                except (OSError, ValueError, IndexError):
                        # stty missing, or its output is not "ROWS COLUMNS"
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url [url...]',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        selection      = optparse.OptionGroup(parser, 'Video Selection')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)
        general.add_option('--list-extractors',
                        action='store_true', dest='list_extractors',
                        help='List all supported extractors and the URLs they would handle', default=False)

        selection.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        selection.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
        selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='all')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
        verbosity.add_option('--skip-download',
                        action='store_true', dest='skip_download', help='do not download the video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--get-format',
                        action='store_true', dest='getformat',
                        help='simulate, quiet but print output format', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--no-continue',
                        action='store_false', dest='continue_dl',
                        help='do not resume partially downloaded files (restart from beginning)')
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac", "vorbis" or "mp3"; best by default')
        postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
                        help='ffmpeg audio bitrate specification, 128k by default')
        postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
                        help='keeps the video file on disk after the post-processing; the video is erased by default')


        # The order of registration is the order of the groups in the help output
        parser.add_option_group(general)
        parser.add_option_group(selection)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        opts, args = parser.parse_args()

        return parser, opts, args
3839
def gen_extractors():
        """ Build one instance of every supported extractor and return them as a list.
        Position in the list is significant: the first extractor that matches a
        URL is the one that handles it, so the generic fallback is added last.
        """
        # These three are passed to the wrapper extractors below as well as
        # being listed on their own.
        youtube = YoutubeIE()
        google = GoogleIE()
        yahoo = YahooIE()

        extractors = [
                YoutubePlaylistIE(youtube),
                YoutubeUserIE(youtube),
                YoutubeSearchIE(youtube),
                youtube,
                MetacafeIE(youtube),
                DailymotionIE(),
                google,
                GoogleSearchIE(google),
                PhotobucketIE(),
                yahoo,
                YahooSearchIE(yahoo),
                DepositFilesIE(),
                FacebookIE(),
                BlipTVIE(),
                VimeoIE(),
                MyVideoIE(),
                ComedyCentralIE(),
                EscapistIE(),
                CollegeHumorIE(),
                XVideosIE(),
        ]
        extractors.append(GenericIE())
        return extractors
3871
def main():
        ''' Program entry point

        Parses the command line, installs the urllib2 opener (proxy, cookies and
        the program's own handler), validates the options, configures a
        FileDownloader with every extractor and downloads the requested URLs.
        Always ends by calling sys.exit(), with the download return code on
        success.
        '''
        # Deferred import: only needed when the password has to be prompted for
        import getpass

        parser, opts, args = parseOpts()

        # Open appropriate CookieJar
        if opts.cookiefile is None:
                jar = cookielib.CookieJar()
        else:
                try:
                        jar = cookielib.MozillaCookieJar(opts.cookiefile)
                        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                                jar.load()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to open cookie file')

        # Dump user agent
        if opts.dump_user_agent:
                print(std_headers['User-Agent'])
                sys.exit(0)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
                try:
                        if opts.batchfile == '-':
                                batchfd = sys.stdin
                        else:
                                batchfd = open(opts.batchfile, 'r')
                        try:
                                batchurls = batchfd.readlines()
                        finally:
                                # Release the file we opened, but leave stdin alone
                                if batchfd is not sys.stdin:
                                        batchfd.close()
                        batchurls = [x.strip() for x in batchurls]
                        # Drop blank lines and lines starting with '#', '/' or ';'
                        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
                except IOError:
                        sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
        urllib2.install_opener(opener)
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        extractors = gen_extractors()

        if opts.list_extractors:
                for ie in extractors:
                        print(ie.IE_NAME)
                        # Each URL is listed under the first extractor that accepts it
                        matchedUrls = [url for url in all_urls if ie.suitable(url)]
                        all_urls = [url for url in all_urls if url not in matchedUrls]
                        for mu in matchedUrls:
                                print(u'  ' + mu)
                sys.exit(0)

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
                parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
                parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
                parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
                parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
                opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
                numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                if numeric_limit is None:
                        parser.error(u'invalid rate limit specified')
                opts.ratelimit = numeric_limit
        if opts.retries is not None:
                try:
                        opts.retries = long(opts.retries)
                except (TypeError, ValueError):
                        parser.error(u'invalid retry count specified')
        try:
                opts.playliststart = int(opts.playliststart)
                if opts.playliststart <= 0:
                        raise ValueError(u'Playlist start must be positive')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist start number specified')
        try:
                opts.playlistend = int(opts.playlistend)
                if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                        raise ValueError(u'Playlist end must be greater than playlist start')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist end number specified')
        if opts.extractaudio:
                if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
                        parser.error(u'invalid audio format specified')

        # File downloader
        fd = FileDownloader({
                'usenetrc': opts.usenetrc,
                'username': opts.username,
                'password': opts.password,
                'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
                'forceurl': opts.geturl,
                'forcetitle': opts.gettitle,
                'forcethumbnail': opts.getthumbnail,
                'forcedescription': opts.getdescription,
                'forcefilename': opts.getfilename,
                'forceformat': opts.getformat,
                'simulate': opts.simulate,
                'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
                'format': opts.format,
                'format_limit': opts.format_limit,
                'listformats': opts.listformats,
                # The first alternative that applies wins: an explicit template, then
                # the templates implied by the format/title/literal/auto-number options
                'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                        or u'%(id)s.%(ext)s'),
                'ignoreerrors': opts.ignoreerrors,
                'ratelimit': opts.ratelimit,
                'nooverwrites': opts.nooverwrites,
                'retries': opts.retries,
                'continuedl': opts.continue_dl,
                'noprogress': opts.noprogress,
                'playliststart': opts.playliststart,
                'playlistend': opts.playlistend,
                'logtostderr': opts.outtmpl == '-',
                'consoletitle': opts.consoletitle,
                'nopart': opts.nopart,
                'updatetime': opts.updatetime,
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
                'matchtitle': opts.matchtitle,
                'rejecttitle': opts.rejecttitle,
                })
        for extractor in extractors:
                fd.add_info_extractor(extractor)

        # PostProcessors
        if opts.extractaudio:
                fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

        # Update version
        if opts.update_self:
                updateSelf(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
                if not opts.update_self:
                        parser.error(u'you must provide at least one URL')
                else:
                        sys.exit()
        retcode = fd.download(all_urls)

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
                try:
                        jar.save()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)
4031
4032
# Script entry point: run main() and turn the known failure modes into an
# exit status instead of a traceback.
if __name__ == '__main__':
        try:
                main()
        except DownloadError:
                # Exit with status 1 without printing anything further here
                sys.exit(1)
        except SameFileError:
                # sys.exit() with a string prints it to stderr and exits with status 1
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')
4042
4043 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: