OpenClassRoom IE (Closes: #234)
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers attached to every request (see YoutubeDLHandler):
# spoofs a desktop Firefox so sites serve the same pages as to a browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
82
# Use the stdlib json module when available; otherwise fall back to a
# minimal pure-Python recursive-descent parser (trivialjson). Only
# json.loads() is provided by the fallback.
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		@staticmethod
		def loads(s):
			"""Parse a UTF-8 byte string as JSON and return the value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors report the offset and remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally require that
				# more input follows.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (captured by the regexp
				# in parseString) into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					# UTF-16 surrogate pair (\uD8xx\uDCxx) combined into
					# a single code point.
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# i points at the opening quote on entry.
				i += 1
				e = i
				# Find the closing quote, skipping quotes preceded by an
				# odd number of backslashes (i.e. escaped quotes).
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# i points at the opening brace on entry.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three keyword literals: true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# Presence of a fraction or exponent selects float.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first non-space character; anything not
				# in CHARMAP must be a number.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		encoding = locale.getpreferredencoding()
		# Sanity-check the locale's answer by actually encoding with it;
		# a bogus or unsupported codec makes us fall back to UTF-8.
		u'TEST'.encode(encoding)
	except:
		encoding = 'UTF-8'
	return encoding
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means standard output. On Windows, stdout must be put
			# into binary mode or the written data would be mangled.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparsable input; propagate that.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
	pass
293
294
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	# Carries no payload; the condition itself is the message.
	pass
302
303
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	# Carries no payload; the PostProcessor reports details itself.
	pass
311
312
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	# NOTE(review): presumably raised by InfoExtractors/download code
	# outside this chunk -- confirm against the callers.
	pass
320
321
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# downloaded: number of bytes actually received
		# expected:   number of bytes the server announced
		self.downloaded = downloaded
		self.expected = expected
336
337
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first (some servers omit the zlib header),
		# then fall back to a standard zlib stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer urllib2.addinfourl takes the response code directly;
		# older versions need it attached after construction.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers onto every request, replacing any
		# same-named header already present.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# The marker header disables compression for this one request
		# and is stripped before the request goes on the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, keeping the original
		# headers, URL, code and msg on the replacement response.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
395
396
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information (a task that InfoExtractors do),
	it has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader hands it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename:    Force printing final filename.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	matchtitle:       Download only matching titles.
	rejecttitle:      Reject downloads for matching titles.
	logtostderr:      Log messages to stderr instead of stdout.
	consoletitle:     Display progress in console window's titlebar.
	nopart:           Do not use temporary .part files.
	updatetime:       Use the Last-modified header to set output file timestamps.
	writedescription: Write the video description to a .description file
	writeinfojson:    Write the video description to a .info.json file
	"""

	params = None           # Option dictionary (see docstring above)
	_ies = []               # Registered InfoExtractors, in order
	_pps = []               # Registered PostProcessors, in order
	_download_retcode = None    # Exit status: set to 1 by trouble() with ignoreerrors
	_num_downloads = None   # Ordinal of current download (feeds %(autonumber)s)
	_screen_file = None     # Stream for status messages (stdout or stderr)
461
462         def __init__(self, params):
463                 """Create a FileDownloader object with the given options."""
464                 self._ies = []
465                 self._pps = []
466                 self._download_retcode = 0
467                 self._num_downloads = 0
468                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
469                 self.params = params
470
471         @staticmethod
472         def format_bytes(bytes):
473                 if bytes is None:
474                         return 'N/A'
475                 if type(bytes) is str:
476                         bytes = float(bytes)
477                 if bytes == 0.0:
478                         exponent = 0
479                 else:
480                         exponent = long(math.log(bytes, 1024.0))
481                 suffix = 'bkMGTPEZY'[exponent]
482                 converted = float(bytes) / float(1024 ** exponent)
483                 return '%.2f%s' % (converted, suffix)
484
485         @staticmethod
486         def calc_percent(byte_counter, data_len):
487                 if data_len is None:
488                         return '---.-%'
489                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491         @staticmethod
492         def calc_eta(start, now, total, current):
493                 if total is None:
494                         return '--:--'
495                 dif = now - start
496                 if current == 0 or dif < 0.001: # One millisecond
497                         return '--:--'
498                 rate = float(current) / dif
499                 eta = long((float(total) - float(current)) / rate)
500                 (eta_mins, eta_secs) = divmod(eta, 60)
501                 if eta_mins > 99:
502                         return '--:--'
503                 return '%02d:%02d' % (eta_mins, eta_secs)
504
505         @staticmethod
506         def calc_speed(start, now, bytes):
507                 dif = now - start
508                 if bytes == 0 or dif < 0.001: # One millisecond
509                         return '%10s' % '---b/s'
510                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512         @staticmethod
513         def best_block_size(elapsed_time, bytes):
514                 new_min = max(bytes / 2.0, 1.0)
515                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516                 if elapsed_time < 0.001:
517                         return long(new_max)
518                 rate = bytes / elapsed_time
519                 if rate > new_max:
520                         return long(new_max)
521                 if rate < new_min:
522                         return long(new_min)
523                 return long(rate)
524
525         @staticmethod
526         def parse_bytes(bytestr):
527                 """Parse a string indicating a byte quantity into a long integer."""
528                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529                 if matchobj is None:
530                         return None
531                 number = float(matchobj.group(1))
532                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533                 return long(round(number * multiplier))
534
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list.

		Also registers self as the extractor's downloader ("mutual
		registration", see class docstring).
		"""
		self._ies.append(ie)
		ie.set_downloader(self)
539
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain.

		Also registers self as the post-processor's downloader.
		"""
		self._pps.append(pp)
		pp.set_downloader(self)
544
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline;
				# the terminator decides whether one is appended.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Flush even in quiet mode so earlier output is not held back.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
555
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		print >>sys.stderr, message.encode(preferredencoding())
559
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible title escape: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
571         def fixed_template(self):
572                 """Checks if the output template is fixed."""
573                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
574
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached with ignoreerrors set: remember the failure for
		# the process exit code instead of aborting.
		self._download_retcode = 1
587
588         def slow_down(self, start_time, byte_counter):
589                 """Sleep if the download speed is over the rate limit."""
590                 rate_limit = self.params.get('ratelimit', None)
591                 if rate_limit is None or byte_counter == 0:
592                         return
593                 now = time.time()
594                 elapsed = now - start_time
595                 if elapsed <= 0.0:
596                         return
597                 speed = float(byte_counter) / elapsed
598                 if speed > rate_limit:
599                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
600
601         def temp_name(self, filename):
602                 """Returns a temporary filename for the given filename."""
603                 if self.params.get('nopart', False) or filename == u'-' or \
604                                 (os.path.exists(filename) and not os.path.isfile(filename)):
605                         return filename
606                 return filename + u'.part'
607
608         def undo_temp_name(self, filename):
609                 if filename.endswith(u'.part'):
610                         return filename[:-len(u'.part')]
611                 return filename
612
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, reporting failure via trouble()."""
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
620
621         def try_utime(self, filename, last_modified_hdr):
622                 """Try to set the last-modified time of the given file."""
623                 if last_modified_hdr is None:
624                         return
625                 if not os.path.isfile(filename):
626                         return
627                 timestr = last_modified_hdr
628                 if timestr is None:
629                         return
630                 filetime = timeconvert(timestr)
631                 if filetime is None:
632                         return filetime
633                 try:
634                         os.utime(filename, (time.time(), filetime))
635                 except:
636                         pass
637                 return filetime
638
	def report_writedescription(self, descfn):
		"""Report that the description file is being written."""
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
643         def report_writeinfojson(self, infofn):
644                 """ Report that the metadata file has been written """
645                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
646
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
650
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# Leading \r plus skip_eol makes the progress line overwrite itself.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
659
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
663
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
667
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the filename when the
			# console encoding cannot represent it.
			self.to_screen(u'[download] The file has already been downloaded')
674
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
678
679         def report_finish(self):
680                 """Report download finished."""
681                 if self.params.get('noprogress', False):
682                         self.to_screen(u'[download] Download completed')
683                 else:
684                         self.to_screen(u'')
685
686         def increment_downloads(self):
687                 """Increment the ordinal that assigns a number to each file."""
688                 self._num_downloads += 1
689
690         def prepare_filename(self, info_dict):
691                 """Generate the output filename."""
692                 try:
693                         template_dict = dict(info_dict)
694                         template_dict['epoch'] = unicode(long(time.time()))
695                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696                         filename = self.params['outtmpl'] % template_dict
697                         return filename
698                 except (ValueError, KeyError), err:
699                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
700                         return None
701
702         def _match_entry(self, info_dict):
703                 """ Returns None iff the file should be downloaded """
704
705                 title = info_dict['title']
706                 matchtitle = self.params.get('matchtitle', False)
707                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
708                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
709                 rejecttitle = self.params.get('rejecttitle', False)
710                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
711                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
712                 return None
713
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In order: apply title match/reject filtering, enforce the
		--max-downloads cap, honour the forced-printing flags
		(--get-title and friends), stop early in simulate mode, write the
		optional .description / .info.json sidecar files, download the
		video data and run the postprocessing chain on success.
		"""

		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		# Enforce --max-downloads. NOTE(review): _num_downloads is presumably
		# already incremented (via increment_downloads()) by the extractor
		# before this runs — confirm against the callers.
		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
				return

		filename = self.prepare_filename(info_dict)

		# Forced printings (these run even when simulating; everything is
		# encoded to the locale's preferred encoding for stdout)
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() returns None on template errors
		if filename is None:
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory hierarchy if needed
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe for a usable json module: on Python < 2.6 'json' may be
			# missing (NameError) or be an unrelated module (AttributeError).
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds a live HTTP response object and is not
					# JSON-serializable, so it is filtered out.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				# Local filesystem failures are surfaced as an unavailable video
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
813
814         def download(self, url_list):
815                 """Download a given list of URLs."""
816                 if len(url_list) > 1 and self.fixed_template():
817                         raise SameFileError(self.params['outtmpl'])
818
819                 for url in url_list:
820                         suitable_found = False
821                         for ie in self._ies:
822                                 # Go to next InfoExtractor if not suitable
823                                 if not ie.suitable(url):
824                                         continue
825
826                                 # Suitable InfoExtractor found
827                                 suitable_found = True
828
829                                 # Extract information from URL and process it
830                                 ie.extract(url)
831
832                                 # Suitable InfoExtractor had been found; go to next URL
833                                 break
834
835                         if not suitable_found:
836                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
837
838                 return self._download_retcode
839
840         def post_process(self, filename, ie_info):
841                 """Run the postprocessing chain on the given file."""
842                 info = dict(ie_info)
843                 info['filepath'] = filename
844                 for pp in self._pps:
845                         info = pp.run(info)
846                         if info is None:
847                                 break
848
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the external rtmpdump tool.

		Downloads into a temporary ".part" name and renames it into place
		on success. Returns True on success, False on failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			# NOTE(review): the devnull handle opened here is never closed
			# explicitly; CPython's refcounting closes it promptly, but that is
			# implementation-specific.
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# ([a, b][cond] is a pre-ternary Python 2 idiom: picks b when cond
		# is truthy, a otherwise.)
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Re-invoke with '-e' (resume); '-k 1' is appended after exit code 1
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress and exit code 1: give up and report the failure below
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
885
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename over HTTP (or via rtmpdump).

		Supports resuming (--continue) with a Range request, retrying on
		HTTP 5xx, adaptive read block sizes, progress reporting and rate
		limiting. Returns True on success; raises ContentTooShortError
		when fewer bytes than Content-Length were received.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request stays Range-free so it can re-probe the full length
		# after a 416 (see below); request may get a Range header added.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				# Existing partial file but no --continue: start over
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				# NOTE(review): this 'urlhandle' assignment is dead — 'data' is
				# unconditionally overwritten by urlopen() on the next line, so
				# the pre-opened handle is never actually reused. Looks like a
				# bug; confirm the intended behavior.
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# Total expected size = remaining bytes + what is already on disk
		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time (only once the first block arrived, so an
			# empty/failed response never truncates an existing partial file)
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1031
1032
class InfoExtractor(object):
	"""Information Extractor base class.

	Given a URL, an information extractor (IE) produces one or more
	dictionaries describing the video(s) behind it; the FileDownloader
	consumes those dictionaries (possibly downloading the video data,
	among other outcomes). Each dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing functions (they let
	youtube-dl act as the backend of a video search, e.g. youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	define a _VALID_URL regexp, and usually be registered in the list of
	extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this IE reports to (set via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		"""Construct the IE, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE should report to."""
		self._downloader = downloader

	def suitable(self, url):
		"""Return True when this IE can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1101
1102
1103 class YoutubeIE(InfoExtractor):
1104         """Information extractor for youtube.com."""
1105
1106         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1107         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1108         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1109         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1110         _NETRC_MACHINE = 'youtube'
1111         # Listed in order of quality
1112         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1113         _video_extensions = {
1114                 '13': '3gp',
1115                 '17': 'mp4',
1116                 '18': 'mp4',
1117                 '22': 'mp4',
1118                 '37': 'mp4',
1119                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1120                 '43': 'webm',
1121                 '44': 'webm',
1122                 '45': 'webm',
1123         }
1124         _video_dimensions = {
1125                 '5': '240x400',
1126                 '6': '???',
1127                 '13': '???',
1128                 '17': '144x176',
1129                 '18': '360x640',
1130                 '22': '720x1280',
1131                 '34': '360x640',
1132                 '35': '480x854',
1133                 '37': '1080x1920',
1134                 '38': '3072x4096',
1135                 '43': '360x640',
1136                 '44': '480x854',
1137                 '45': '720x1280',
1138         }       
1139         IE_NAME = u'youtube'
1140
1141         def report_lang(self):
1142                 """Report attempt to set language."""
1143                 self._downloader.to_screen(u'[youtube] Setting language')
1144
1145         def report_login(self):
1146                 """Report attempt to log in."""
1147                 self._downloader.to_screen(u'[youtube] Logging in')
1148
1149         def report_age_confirmation(self):
1150                 """Report attempt to confirm age."""
1151                 self._downloader.to_screen(u'[youtube] Confirming age')
1152
1153         def report_video_webpage_download(self, video_id):
1154                 """Report attempt to download video webpage."""
1155                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1156
1157         def report_video_info_webpage_download(self, video_id):
1158                 """Report attempt to download video info webpage."""
1159                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1160
1161         def report_information_extraction(self, video_id):
1162                 """Report attempt to extract video information."""
1163                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1164
1165         def report_unavailable_format(self, video_id, format):
1166                 """Report extracted video URL."""
1167                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1168
1169         def report_rtmp_download(self):
1170                 """Indicate the download will use the RTMP protocol."""
1171                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1172
1173         def _print_formats(self, formats):
1174                 print 'Available formats:'
1175                 for x in formats:
1176                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1177
	def _real_initialize(self):
		"""One-time YouTube session setup.

		Forces the site language to English (the extraction regexps depend
		on it), then optionally logs in with --username/--password or
		.netrc credentials and confirms the age gate. Failures in the
		optional steps emit warnings/errors and abort silently.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					# authenticators() returns (login, account, password)
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (response body is irrelevant; the cookie side effect
		# on the shared opener is what matters)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1246
1247         def _real_extract(self, url):
1248                 # Extract video id from URL
1249                 mobj = re.match(self._VALID_URL, url)
1250                 if mobj is None:
1251                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1252                         return
1253                 video_id = mobj.group(2)
1254
1255                 # Get video webpage
1256                 self.report_video_webpage_download(video_id)
1257                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1258                 try:
1259                         video_webpage = urllib2.urlopen(request).read()
1260                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1261                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1262                         return
1263
1264                 # Attempt to extract SWF player URL
1265                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1266                 if mobj is not None:
1267                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1268                 else:
1269                         player_url = None
1270
1271                 # Get video info
1272                 self.report_video_info_webpage_download(video_id)
1273                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1274                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1275                                         % (video_id, el_type))
1276                         request = urllib2.Request(video_info_url)
1277                         try:
1278                                 video_info_webpage = urllib2.urlopen(request).read()
1279                                 video_info = parse_qs(video_info_webpage)
1280                                 if 'token' in video_info:
1281                                         break
1282                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1284                                 return
1285                 if 'token' not in video_info:
1286                         if 'reason' in video_info:
1287                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1288                         else:
1289                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1290                         return
1291
1292                 # Start extracting information
1293                 self.report_information_extraction(video_id)
1294
1295                 # uploader
1296                 if 'author' not in video_info:
1297                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1298                         return
1299                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1300
1301                 # title
1302                 if 'title' not in video_info:
1303                         self._downloader.trouble(u'ERROR: unable to extract video title')
1304                         return
1305                 video_title = urllib.unquote_plus(video_info['title'][0])
1306                 video_title = video_title.decode('utf-8')
1307                 video_title = sanitize_title(video_title)
1308
1309                 # simplified title
1310                 simple_title = _simplify_title(video_title)
1311
1312                 # thumbnail image
1313                 if 'thumbnail_url' not in video_info:
1314                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1315                         video_thumbnail = ''
1316                 else:   # don't panic if we can't find it
1317                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1318
1319                 # upload date
1320                 upload_date = u'NA'
1321                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1322                 if mobj is not None:
1323                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1324                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1325                         for expression in format_expressions:
1326                                 try:
1327                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1328                                 except:
1329                                         pass
1330
1331                 # description
1332                 try:
1333                         lxml.etree
1334                 except NameError:
1335                         video_description = u'No description available.'
1336                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1337                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1338                                 if mobj is not None:
1339                                         video_description = mobj.group(1).decode('utf-8')
1340                 else:
1341                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1342                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1343                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1344                         # TODO use another parser
1345
1346                 # token
1347                 video_token = urllib.unquote_plus(video_info['token'][0])
1348
1349                 # Decide which formats to download
1350                 req_format = self._downloader.params.get('format', None)
1351
1352                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1353                         self.report_rtmp_download()
1354                         video_url_list = [(None, video_info['conn'][0])]
1355                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1356                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1357                         url_data = [parse_qs(uds) for uds in url_data_strs]
1358                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1359                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1360
1361                         format_limit = self._downloader.params.get('format_limit', None)
1362                         if format_limit is not None and format_limit in self._available_formats:
1363                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1364                         else:
1365                                 format_list = self._available_formats
1366                         existing_formats = [x for x in format_list if x in url_map]
1367                         if len(existing_formats) == 0:
1368                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1369                                 return
1370                         if self._downloader.params.get('listformats', None):
1371                                 self._print_formats(existing_formats)
1372                                 return
1373                         if req_format is None or req_format == 'best':
1374                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1375                         elif req_format == 'worst':
1376                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1377                         elif req_format in ('-1', 'all'):
1378                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1379                         else:
1380                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1381                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1382                                 req_formats = req_format.split('/')
1383                                 video_url_list = None
1384                                 for rf in req_formats:
1385                                         if rf in url_map:
1386                                                 video_url_list = [(rf, url_map[rf])]
1387                                                 break
1388                                 if video_url_list is None:
1389                                         self._downloader.trouble(u'ERROR: requested format not available')
1390                                         return
1391                 else:
1392                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1393                         return
1394
1395                 for format_param, video_real_url in video_url_list:
1396                         # At this point we have a new video
1397                         self._downloader.increment_downloads()
1398
1399                         # Extension
1400                         video_extension = self._video_extensions.get(format_param, 'flv')
1401
1402                         try:
1403                                 # Process video information
1404                                 self._downloader.process_info({
1405                                         'id':           video_id.decode('utf-8'),
1406                                         'url':          video_real_url.decode('utf-8'),
1407                                         'uploader':     video_uploader.decode('utf-8'),
1408                                         'upload_date':  upload_date,
1409                                         'title':        video_title,
1410                                         'stitle':       simple_title,
1411                                         'ext':          video_extension.decode('utf-8'),
1412                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1413                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1414                                         'description':  video_description,
1415                                         'player_url':   player_url,
1416                                 })
1417                         except UnavailableVideoError, err:
1418                                 self._downloader.trouble(u'\nERROR: unable to download video')
1419
1420
1421 class MetacafeIE(InfoExtractor):
1422         """Information Extractor for metacafe.com."""
1423
1424         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1425         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1426         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1427         _youtube_ie = None
1428         IE_NAME = u'metacafe'
1429
1430         def __init__(self, youtube_ie, downloader=None):
1431                 InfoExtractor.__init__(self, downloader)
1432                 self._youtube_ie = youtube_ie
1433
1434         def report_disclaimer(self):
1435                 """Report disclaimer retrieval."""
1436                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1437
1438         def report_age_confirmation(self):
1439                 """Report attempt to confirm age."""
1440                 self._downloader.to_screen(u'[metacafe] Confirming age')
1441
1442         def report_download_webpage(self, video_id):
1443                 """Report webpage download."""
1444                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1445
1446         def report_extraction(self, video_id):
1447                 """Report information extraction."""
1448                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1449
1450         def _real_initialize(self):
1451                 # Retrieve disclaimer
1452                 request = urllib2.Request(self._DISCLAIMER)
1453                 try:
1454                         self.report_disclaimer()
1455                         disclaimer = urllib2.urlopen(request).read()
1456                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1457                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1458                         return
1459
1460                 # Confirm age
1461                 disclaimer_form = {
1462                         'filters': '0',
1463                         'submit': "Continue - I'm over 18",
1464                         }
1465                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1466                 try:
1467                         self.report_age_confirmation()
1468                         disclaimer = urllib2.urlopen(request).read()
1469                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1471                         return
1472
1473         def _real_extract(self, url):
1474                 # Extract id and simplified title from URL
1475                 mobj = re.match(self._VALID_URL, url)
1476                 if mobj is None:
1477                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1478                         return
1479
1480                 video_id = mobj.group(1)
1481
1482                 # Check if video comes from YouTube
1483                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1484                 if mobj2 is not None:
1485                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1486                         return
1487
1488                 # At this point we have a new video
1489                 self._downloader.increment_downloads()
1490
1491                 simple_title = mobj.group(2).decode('utf-8')
1492
1493                 # Retrieve video webpage to extract further information
1494                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1495                 try:
1496                         self.report_download_webpage(video_id)
1497                         webpage = urllib2.urlopen(request).read()
1498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1499                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1500                         return
1501
1502                 # Extract URL, uploader and title from webpage
1503                 self.report_extraction(video_id)
1504                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1505                 if mobj is not None:
1506                         mediaURL = urllib.unquote(mobj.group(1))
1507                         video_extension = mediaURL[-3:]
1508
1509                         # Extract gdaKey if available
1510                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1511                         if mobj is None:
1512                                 video_url = mediaURL
1513                         else:
1514                                 gdaKey = mobj.group(1)
1515                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1516                 else:
1517                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1518                         if mobj is None:
1519                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1520                                 return
1521                         vardict = parse_qs(mobj.group(1))
1522                         if 'mediaData' not in vardict:
1523                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1524                                 return
1525                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1526                         if mobj is None:
1527                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1528                                 return
1529                         mediaURL = mobj.group(1).replace('\\/', '/')
1530                         video_extension = mediaURL[-3:]
1531                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1532
1533                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1534                 if mobj is None:
1535                         self._downloader.trouble(u'ERROR: unable to extract title')
1536                         return
1537                 video_title = mobj.group(1).decode('utf-8')
1538                 video_title = sanitize_title(video_title)
1539
1540                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1541                 if mobj is None:
1542                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1543                         return
1544                 video_uploader = mobj.group(1)
1545
1546                 try:
1547                         # Process video information
1548                         self._downloader.process_info({
1549                                 'id':           video_id.decode('utf-8'),
1550                                 'url':          video_url.decode('utf-8'),
1551                                 'uploader':     video_uploader.decode('utf-8'),
1552                                 'upload_date':  u'NA',
1553                                 'title':        video_title,
1554                                 'stitle':       simple_title,
1555                                 'ext':          video_extension.decode('utf-8'),
1556                                 'format':       u'NA',
1557                                 'player_url':   None,
1558                         })
1559                 except UnavailableVideoError:
1560                         self._downloader.trouble(u'\nERROR: unable to download video')
1561
1562
1563 class DailymotionIE(InfoExtractor):
1564         """Information Extractor for Dailymotion"""
1565
1566         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1567         IE_NAME = u'dailymotion'
1568
1569         def __init__(self, downloader=None):
1570                 InfoExtractor.__init__(self, downloader)
1571
1572         def report_download_webpage(self, video_id):
1573                 """Report webpage download."""
1574                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1575
1576         def report_extraction(self, video_id):
1577                 """Report information extraction."""
1578                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1579
1580         def _real_extract(self, url):
1581                 # Extract id and simplified title from URL
1582                 mobj = re.match(self._VALID_URL, url)
1583                 if mobj is None:
1584                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1585                         return
1586
1587                 # At this point we have a new video
1588                 self._downloader.increment_downloads()
1589                 video_id = mobj.group(1)
1590
1591                 simple_title = mobj.group(2).decode('utf-8')
1592                 video_extension = 'flv'
1593
1594                 # Retrieve video webpage to extract further information
1595                 request = urllib2.Request(url)
1596                 request.add_header('Cookie', 'family_filter=off')
1597                 try:
1598                         self.report_download_webpage(video_id)
1599                         webpage = urllib2.urlopen(request).read()
1600                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1601                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1602                         return
1603
1604                 # Extract URL, uploader and title from webpage
1605                 self.report_extraction(video_id)
1606                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1609                         return
1610                 sequence = urllib.unquote(mobj.group(1))
1611                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1612                 if mobj is None:
1613                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1614                         return
1615                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1616
1617                 # if needed add http://www.dailymotion.com/ if relative URL
1618
1619                 video_url = mediaURL
1620
1621                 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1622                 if mobj is None:
1623                         self._downloader.trouble(u'ERROR: unable to extract title')
1624                         return
1625                 video_title = mobj.group(1).decode('utf-8')
1626                 video_title = sanitize_title(video_title)
1627
1628                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1631                         return
1632                 video_uploader = mobj.group(1)
1633
1634                 try:
1635                         # Process video information
1636                         self._downloader.process_info({
1637                                 'id':           video_id.decode('utf-8'),
1638                                 'url':          video_url.decode('utf-8'),
1639                                 'uploader':     video_uploader.decode('utf-8'),
1640                                 'upload_date':  u'NA',
1641                                 'title':        video_title,
1642                                 'stitle':       simple_title,
1643                                 'ext':          video_extension.decode('utf-8'),
1644                                 'format':       u'NA',
1645                                 'player_url':   None,
1646                         })
1647                 except UnavailableVideoError:
1648                         self._downloader.trouble(u'\nERROR: unable to download video')
1649
1650
1651 class GoogleIE(InfoExtractor):
1652         """Information extractor for video.google.com."""
1653
1654         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1655         IE_NAME = u'video.google'
1656
1657         def __init__(self, downloader=None):
1658                 InfoExtractor.__init__(self, downloader)
1659
1660         def report_download_webpage(self, video_id):
1661                 """Report webpage download."""
1662                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1663
1664         def report_extraction(self, video_id):
1665                 """Report information extraction."""
1666                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1667
1668         def _real_extract(self, url):
1669                 # Extract id from URL
1670                 mobj = re.match(self._VALID_URL, url)
1671                 if mobj is None:
1672                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1673                         return
1674
1675                 # At this point we have a new video
1676                 self._downloader.increment_downloads()
1677                 video_id = mobj.group(1)
1678
1679                 video_extension = 'mp4'
1680
1681                 # Retrieve video webpage to extract further information
1682                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1683                 try:
1684                         self.report_download_webpage(video_id)
1685                         webpage = urllib2.urlopen(request).read()
1686                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1687                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1688                         return
1689
1690                 # Extract URL, uploader, and title from webpage
1691                 self.report_extraction(video_id)
1692                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1693                 if mobj is None:
1694                         video_extension = 'flv'
1695                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1696                 if mobj is None:
1697                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1698                         return
1699                 mediaURL = urllib.unquote(mobj.group(1))
1700                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1701                 mediaURL = mediaURL.replace('\\x26', '\x26')
1702
1703                 video_url = mediaURL
1704
1705                 mobj = re.search(r'<title>(.*)</title>', webpage)
1706                 if mobj is None:
1707                         self._downloader.trouble(u'ERROR: unable to extract title')
1708                         return
1709                 video_title = mobj.group(1).decode('utf-8')
1710                 video_title = sanitize_title(video_title)
1711                 simple_title = _simplify_title(video_title)
1712
1713                 # Extract video description
1714                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1715                 if mobj is None:
1716                         self._downloader.trouble(u'ERROR: unable to extract video description')
1717                         return
1718                 video_description = mobj.group(1).decode('utf-8')
1719                 if not video_description:
1720                         video_description = 'No description available.'
1721
1722                 # Extract video thumbnail
1723                 if self._downloader.params.get('forcethumbnail', False):
1724                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1725                         try:
1726                                 webpage = urllib2.urlopen(request).read()
1727                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1728                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1729                                 return
1730                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1731                         if mobj is None:
1732                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1733                                 return
1734                         video_thumbnail = mobj.group(1)
1735                 else:   # we need something to pass to process_info
1736                         video_thumbnail = ''
1737
1738                 try:
1739                         # Process video information
1740                         self._downloader.process_info({
1741                                 'id':           video_id.decode('utf-8'),
1742                                 'url':          video_url.decode('utf-8'),
1743                                 'uploader':     u'NA',
1744                                 'upload_date':  u'NA',
1745                                 'title':        video_title,
1746                                 'stitle':       simple_title,
1747                                 'ext':          video_extension.decode('utf-8'),
1748                                 'format':       u'NA',
1749                                 'player_url':   None,
1750                         })
1751                 except UnavailableVideoError:
1752                         self._downloader.trouble(u'\nERROR: unable to download video')
1753
1754
1755 class PhotobucketIE(InfoExtractor):
1756         """Information extractor for photobucket.com."""
1757
1758         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1759         IE_NAME = u'photobucket'
1760
1761         def __init__(self, downloader=None):
1762                 InfoExtractor.__init__(self, downloader)
1763
1764         def report_download_webpage(self, video_id):
1765                 """Report webpage download."""
1766                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1767
1768         def report_extraction(self, video_id):
1769                 """Report information extraction."""
1770                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1771
1772         def _real_extract(self, url):
1773                 # Extract id from URL
1774                 mobj = re.match(self._VALID_URL, url)
1775                 if mobj is None:
1776                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1777                         return
1778
1779                 # At this point we have a new video
1780                 self._downloader.increment_downloads()
1781                 video_id = mobj.group(1)
1782
1783                 video_extension = 'flv'
1784
1785                 # Retrieve video webpage to extract further information
1786                 request = urllib2.Request(url)
1787                 try:
1788                         self.report_download_webpage(video_id)
1789                         webpage = urllib2.urlopen(request).read()
1790                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1791                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1792                         return
1793
1794                 # Extract URL, uploader, and title from webpage
1795                 self.report_extraction(video_id)
1796                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1797                 if mobj is None:
1798                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1799                         return
1800                 mediaURL = urllib.unquote(mobj.group(1))
1801
1802                 video_url = mediaURL
1803
1804                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1805                 if mobj is None:
1806                         self._downloader.trouble(u'ERROR: unable to extract title')
1807                         return
1808                 video_title = mobj.group(1).decode('utf-8')
1809                 video_title = sanitize_title(video_title)
1810                 simple_title = _simplify_title(vide_title)
1811
1812                 video_uploader = mobj.group(2).decode('utf-8')
1813
1814                 try:
1815                         # Process video information
1816                         self._downloader.process_info({
1817                                 'id':           video_id.decode('utf-8'),
1818                                 'url':          video_url.decode('utf-8'),
1819                                 'uploader':     video_uploader,
1820                                 'upload_date':  u'NA',
1821                                 'title':        video_title,
1822                                 'stitle':       simple_title,
1823                                 'ext':          video_extension.decode('utf-8'),
1824                                 'format':       u'NA',
1825                                 'player_url':   None,
1826                         })
1827                 except UnavailableVideoError:
1828                         self._downloader.trouble(u'\nERROR: unable to download video')
1829
1830
1831 class YahooIE(InfoExtractor):
1832         """Information extractor for video.yahoo.com."""
1833
1834         # _VALID_URL matches all Yahoo! Video URLs
1835         # _VPAGE_URL matches only the extractable '/watch/' URLs
1836         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1837         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1838         IE_NAME = u'video.yahoo'
1839
1840         def __init__(self, downloader=None):
1841                 InfoExtractor.__init__(self, downloader)
1842
1843         def report_download_webpage(self, video_id):
1844                 """Report webpage download."""
1845                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1846
1847         def report_extraction(self, video_id):
1848                 """Report information extraction."""
1849                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1850
1851         def _real_extract(self, url, new_video=True):
1852                 # Extract ID from URL
1853                 mobj = re.match(self._VALID_URL, url)
1854                 if mobj is None:
1855                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1856                         return
1857
1858                 # At this point we have a new video
1859                 self._downloader.increment_downloads()
1860                 video_id = mobj.group(2)
1861                 video_extension = 'flv'
1862
1863                 # Rewrite valid but non-extractable URLs as
1864                 # extractable English language /watch/ URLs
1865                 if re.match(self._VPAGE_URL, url) is None:
1866                         request = urllib2.Request(url)
1867                         try:
1868                                 webpage = urllib2.urlopen(request).read()
1869                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1870                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1871                                 return
1872
1873                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1874                         if mobj is None:
1875                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1876                                 return
1877                         yahoo_id = mobj.group(1)
1878
1879                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1880                         if mobj is None:
1881                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1882                                 return
1883                         yahoo_vid = mobj.group(1)
1884
1885                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1886                         return self._real_extract(url, new_video=False)
1887
1888                 # Retrieve video webpage to extract further information
1889                 request = urllib2.Request(url)
1890                 try:
1891                         self.report_download_webpage(video_id)
1892                         webpage = urllib2.urlopen(request).read()
1893                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1894                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1895                         return
1896
1897                 # Extract uploader and title from webpage
1898                 self.report_extraction(video_id)
1899                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1900                 if mobj is None:
1901                         self._downloader.trouble(u'ERROR: unable to extract video title')
1902                         return
1903                 video_title = mobj.group(1).decode('utf-8')
1904                 simple_title = _simplify_title(video_title)
1905
1906                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1907                 if mobj is None:
1908                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1909                         return
1910                 video_uploader = mobj.group(1).decode('utf-8')
1911
1912                 # Extract video thumbnail
1913                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1914                 if mobj is None:
1915                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1916                         return
1917                 video_thumbnail = mobj.group(1).decode('utf-8')
1918
1919                 # Extract video description
1920                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract video description')
1923                         return
1924                 video_description = mobj.group(1).decode('utf-8')
1925                 if not video_description:
1926                         video_description = 'No description available.'
1927
1928                 # Extract video height and width
1929                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1930                 if mobj is None:
1931                         self._downloader.trouble(u'ERROR: unable to extract video height')
1932                         return
1933                 yv_video_height = mobj.group(1)
1934
1935                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1936                 if mobj is None:
1937                         self._downloader.trouble(u'ERROR: unable to extract video width')
1938                         return
1939                 yv_video_width = mobj.group(1)
1940
1941                 # Retrieve video playlist to extract media URL
1942                 # I'm not completely sure what all these options are, but we
1943                 # seem to need most of them, otherwise the server sends a 401.
1944                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1945                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1946                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1947                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1948                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1949                 try:
1950                         self.report_download_webpage(video_id)
1951                         webpage = urllib2.urlopen(request).read()
1952                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1953                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1954                         return
1955
1956                 # Extract media URL from playlist XML
1957                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1958                 if mobj is None:
1959                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1960                         return
1961                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1962                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1963
1964                 try:
1965                         # Process video information
1966                         self._downloader.process_info({
1967                                 'id':           video_id.decode('utf-8'),
1968                                 'url':          video_url,
1969                                 'uploader':     video_uploader,
1970                                 'upload_date':  u'NA',
1971                                 'title':        video_title,
1972                                 'stitle':       simple_title,
1973                                 'ext':          video_extension.decode('utf-8'),
1974                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1975                                 'description':  video_description,
1976                                 'thumbnail':    video_thumbnail,
1977                                 'player_url':   None,
1978                         })
1979                 except UnavailableVideoError:
1980                         self._downloader.trouble(u'\nERROR: unable to download video')
1981
1982
1983 class VimeoIE(InfoExtractor):
1984         """Information extractor for vimeo.com."""
1985
1986         # _VALID_URL matches Vimeo URLs
1987         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1988         IE_NAME = u'vimeo'
1989
1990         def __init__(self, downloader=None):
1991                 InfoExtractor.__init__(self, downloader)
1992
1993         def report_download_webpage(self, video_id):
1994                 """Report webpage download."""
1995                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1996
1997         def report_extraction(self, video_id):
1998                 """Report information extraction."""
1999                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2000
2001         def _real_extract(self, url, new_video=True):
2002                 # Extract ID from URL
2003                 mobj = re.match(self._VALID_URL, url)
2004                 if mobj is None:
2005                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2006                         return
2007
2008                 # At this point we have a new video
2009                 self._downloader.increment_downloads()
2010                 video_id = mobj.group(1)
2011
2012                 # Retrieve video webpage to extract further information
2013                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2014                 try:
2015                         self.report_download_webpage(video_id)
2016                         webpage = urllib2.urlopen(request).read()
2017                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2019                         return
2020
2021                 # Now we begin extracting as much information as we can from what we
2022                 # retrieved. First we extract the information common to all extractors,
2023                 # and latter we extract those that are Vimeo specific.
2024                 self.report_extraction(video_id)
2025
2026                 # Extract title
2027                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video title')
2030                         return
2031                 video_title = mobj.group(1).decode('utf-8')
2032                 simple_title = _simplify_title(video_title)
2033
2034                 # Extract uploader
2035                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2036                 if mobj is None:
2037                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2038                         return
2039                 video_uploader = mobj.group(1).decode('utf-8')
2040
2041                 # Extract video thumbnail
2042                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2045                         return
2046                 video_thumbnail = mobj.group(1).decode('utf-8')
2047
2048                 # # Extract video description
2049                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2050                 # if mobj is None:
2051                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2052                 #       return
2053                 # video_description = mobj.group(1).decode('utf-8')
2054                 # if not video_description: video_description = 'No description available.'
2055                 video_description = 'Foo.'
2056
2057                 # Vimeo specific: extract request signature
2058                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2059                 if mobj is None:
2060                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2061                         return
2062                 sig = mobj.group(1).decode('utf-8')
2063
2064                 # Vimeo specific: extract video quality information
2065                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2066                 if mobj is None:
2067                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2068                         return
2069                 quality = mobj.group(1).decode('utf-8')
2070
2071                 if int(quality) == 1:
2072                         quality = 'hd'
2073                 else:
2074                         quality = 'sd'
2075
2076                 # Vimeo specific: Extract request signature expiration
2077                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2078                 if mobj is None:
2079                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2080                         return
2081                 sig_exp = mobj.group(1).decode('utf-8')
2082
2083                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2084
2085                 try:
2086                         # Process video information
2087                         self._downloader.process_info({
2088                                 'id':           video_id.decode('utf-8'),
2089                                 'url':          video_url,
2090                                 'uploader':     video_uploader,
2091                                 'upload_date':  u'NA',
2092                                 'title':        video_title,
2093                                 'stitle':       simple_title,
2094                                 'ext':          u'mp4',
2095                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2096                                 'description':  video_description,
2097                                 'thumbnail':    video_thumbnail,
2098                                 'description':  video_description,
2099                                 'player_url':   None,
2100                         })
2101                 except UnavailableVideoError:
2102                         self._downloader.trouble(u'ERROR: unable to download video')
2103
2104
2105 class GenericIE(InfoExtractor):
2106         """Generic last-resort information extractor."""
2107
2108         _VALID_URL = r'.*'
2109         IE_NAME = u'generic'
2110
2111         def __init__(self, downloader=None):
2112                 InfoExtractor.__init__(self, downloader)
2113
2114         def report_download_webpage(self, video_id):
2115                 """Report webpage download."""
2116                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2117                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2118
2119         def report_extraction(self, video_id):
2120                 """Report information extraction."""
2121                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2122
2123         def _real_extract(self, url):
2124                 # At this point we have a new video
2125                 self._downloader.increment_downloads()
2126
2127                 video_id = url.split('/')[-1]
2128                 request = urllib2.Request(url)
2129                 try:
2130                         self.report_download_webpage(video_id)
2131                         webpage = urllib2.urlopen(request).read()
2132                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2134                         return
2135                 except ValueError, err:
2136                         # since this is the last-resort InfoExtractor, if
2137                         # this error is thrown, it'll be thrown here
2138                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2139                         return
2140
2141                 self.report_extraction(video_id)
2142                 # Start with something easy: JW Player in SWFObject
2143                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2144                 if mobj is None:
2145                         # Broaden the search a little bit
2146                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2147                 if mobj is None:
2148                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2149                         return
2150
2151                 # It's possible that one of the regexes
2152                 # matched, but returned an empty group:
2153                 if mobj.group(1) is None:
2154                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2155                         return
2156
2157                 video_url = urllib.unquote(mobj.group(1))
2158                 video_id = os.path.basename(video_url)
2159
2160                 # here's a fun little line of code for you:
2161                 video_extension = os.path.splitext(video_id)[1][1:]
2162                 video_id = os.path.splitext(video_id)[0]
2163
2164                 # it's tempting to parse this further, but you would
2165                 # have to take into account all the variations like
2166                 #   Video Title - Site Name
2167                 #   Site Name | Video Title
2168                 #   Video Title - Tagline | Site Name
2169                 # and so on and so forth; it's just not practical
2170                 mobj = re.search(r'<title>(.*)</title>', webpage)
2171                 if mobj is None:
2172                         self._downloader.trouble(u'ERROR: unable to extract title')
2173                         return
2174                 video_title = mobj.group(1).decode('utf-8')
2175                 video_title = sanitize_title(video_title)
2176                 simple_title = _simplify_title(video_title)
2177
2178                 # video uploader is domain name
2179                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2180                 if mobj is None:
2181                         self._downloader.trouble(u'ERROR: unable to extract title')
2182                         return
2183                 video_uploader = mobj.group(1).decode('utf-8')
2184
2185                 try:
2186                         # Process video information
2187                         self._downloader.process_info({
2188                                 'id':           video_id.decode('utf-8'),
2189                                 'url':          video_url.decode('utf-8'),
2190                                 'uploader':     video_uploader,
2191                                 'upload_date':  u'NA',
2192                                 'title':        video_title,
2193                                 'stitle':       simple_title,
2194                                 'ext':          video_extension.decode('utf-8'),
2195                                 'format':       u'NA',
2196                                 'player_url':   None,
2197                         })
2198                 except UnavailableVideoError, err:
2199                         self._downloader.trouble(u'\nERROR: unable to download video')
2200
2201
2202 class YoutubeSearchIE(InfoExtractor):
2203         """Information Extractor for YouTube search queries."""
2204         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2205         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2206         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2207         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2208         _youtube_ie = None
2209         _max_youtube_results = 1000
2210         IE_NAME = u'youtube:search'
2211
2212         def __init__(self, youtube_ie, downloader=None):
2213                 InfoExtractor.__init__(self, downloader)
2214                 self._youtube_ie = youtube_ie
2215
2216         def report_download_page(self, query, pagenum):
2217                 """Report attempt to download playlist page with given number."""
2218                 query = query.decode(preferredencoding())
2219                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2220
2221         def _real_initialize(self):
2222                 self._youtube_ie.initialize()
2223
2224         def _real_extract(self, query):
2225                 mobj = re.match(self._VALID_URL, query)
2226                 if mobj is None:
2227                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2228                         return
2229
2230                 prefix, query = query.split(':')
2231                 prefix = prefix[8:]
2232                 query = query.encode('utf-8')
2233                 if prefix == '':
2234                         self._download_n_results(query, 1)
2235                         return
2236                 elif prefix == 'all':
2237                         self._download_n_results(query, self._max_youtube_results)
2238                         return
2239                 else:
2240                         try:
2241                                 n = long(prefix)
2242                                 if n <= 0:
2243                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2244                                         return
2245                                 elif n > self._max_youtube_results:
2246                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2247                                         n = self._max_youtube_results
2248                                 self._download_n_results(query, n)
2249                                 return
2250                         except ValueError: # parsing prefix as integer fails
2251                                 self._download_n_results(query, 1)
2252                                 return
2253
2254         def _download_n_results(self, query, n):
2255                 """Downloads a specified number of results for a query"""
2256
2257                 video_ids = []
2258                 already_seen = set()
2259                 pagenum = 1
2260
2261                 while True:
2262                         self.report_download_page(query, pagenum)
2263                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2264                         request = urllib2.Request(result_url)
2265                         try:
2266                                 page = urllib2.urlopen(request).read()
2267                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2268                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2269                                 return
2270
2271                         # Extract video identifiers
2272                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2273                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2274                                 if video_id not in already_seen:
2275                                         video_ids.append(video_id)
2276                                         already_seen.add(video_id)
2277                                         if len(video_ids) == n:
2278                                                 # Specified n videos reached
2279                                                 for id in video_ids:
2280                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2281                                                 return
2282
2283                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2284                                 for id in video_ids:
2285                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2286                                 return
2287
2288                         pagenum = pagenum + 1
2289
2290
2291 class GoogleSearchIE(InfoExtractor):
2292         """Information Extractor for Google Video search queries."""
2293         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2294         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2295         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2296         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2297         _google_ie = None
2298         _max_google_results = 1000
2299         IE_NAME = u'video.google:search'
2300
2301         def __init__(self, google_ie, downloader=None):
2302                 InfoExtractor.__init__(self, downloader)
2303                 self._google_ie = google_ie
2304
2305         def report_download_page(self, query, pagenum):
2306                 """Report attempt to download playlist page with given number."""
2307                 query = query.decode(preferredencoding())
2308                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2309
2310         def _real_initialize(self):
2311                 self._google_ie.initialize()
2312
2313         def _real_extract(self, query):
2314                 mobj = re.match(self._VALID_URL, query)
2315                 if mobj is None:
2316                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2317                         return
2318
2319                 prefix, query = query.split(':')
2320                 prefix = prefix[8:]
2321                 query = query.encode('utf-8')
2322                 if prefix == '':
2323                         self._download_n_results(query, 1)
2324                         return
2325                 elif prefix == 'all':
2326                         self._download_n_results(query, self._max_google_results)
2327                         return
2328                 else:
2329                         try:
2330                                 n = long(prefix)
2331                                 if n <= 0:
2332                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2333                                         return
2334                                 elif n > self._max_google_results:
2335                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2336                                         n = self._max_google_results
2337                                 self._download_n_results(query, n)
2338                                 return
2339                         except ValueError: # parsing prefix as integer fails
2340                                 self._download_n_results(query, 1)
2341                                 return
2342
2343         def _download_n_results(self, query, n):
2344                 """Downloads a specified number of results for a query"""
2345
2346                 video_ids = []
2347                 already_seen = set()
2348                 pagenum = 1
2349
2350                 while True:
2351                         self.report_download_page(query, pagenum)
2352                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2353                         request = urllib2.Request(result_url)
2354                         try:
2355                                 page = urllib2.urlopen(request).read()
2356                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2357                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2358                                 return
2359
2360                         # Extract video identifiers
2361                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2362                                 video_id = mobj.group(1)
2363                                 if video_id not in already_seen:
2364                                         video_ids.append(video_id)
2365                                         already_seen.add(video_id)
2366                                         if len(video_ids) == n:
2367                                                 # Specified n videos reached
2368                                                 for id in video_ids:
2369                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2370                                                 return
2371
2372                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2373                                 for id in video_ids:
2374                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2375                                 return
2376
2377                         pagenum = pagenum + 1
2378
2379
2380 class YahooSearchIE(InfoExtractor):
2381         """Information Extractor for Yahoo! Video search queries."""
2382         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2383         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2384         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2385         _MORE_PAGES_INDICATOR = r'\s*Next'
2386         _yahoo_ie = None
2387         _max_yahoo_results = 1000
2388         IE_NAME = u'video.yahoo:search'
2389
2390         def __init__(self, yahoo_ie, downloader=None):
2391                 InfoExtractor.__init__(self, downloader)
2392                 self._yahoo_ie = yahoo_ie
2393
2394         def report_download_page(self, query, pagenum):
2395                 """Report attempt to download playlist page with given number."""
2396                 query = query.decode(preferredencoding())
2397                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2398
2399         def _real_initialize(self):
2400                 self._yahoo_ie.initialize()
2401
2402         def _real_extract(self, query):
2403                 mobj = re.match(self._VALID_URL, query)
2404                 if mobj is None:
2405                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2406                         return
2407
2408                 prefix, query = query.split(':')
2409                 prefix = prefix[8:]
2410                 query = query.encode('utf-8')
2411                 if prefix == '':
2412                         self._download_n_results(query, 1)
2413                         return
2414                 elif prefix == 'all':
2415                         self._download_n_results(query, self._max_yahoo_results)
2416                         return
2417                 else:
2418                         try:
2419                                 n = long(prefix)
2420                                 if n <= 0:
2421                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2422                                         return
2423                                 elif n > self._max_yahoo_results:
2424                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2425                                         n = self._max_yahoo_results
2426                                 self._download_n_results(query, n)
2427                                 return
2428                         except ValueError: # parsing prefix as integer fails
2429                                 self._download_n_results(query, 1)
2430                                 return
2431
2432         def _download_n_results(self, query, n):
2433                 """Downloads a specified number of results for a query"""
2434
2435                 video_ids = []
2436                 already_seen = set()
2437                 pagenum = 1
2438
2439                 while True:
2440                         self.report_download_page(query, pagenum)
2441                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2442                         request = urllib2.Request(result_url)
2443                         try:
2444                                 page = urllib2.urlopen(request).read()
2445                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2446                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2447                                 return
2448
2449                         # Extract video identifiers
2450                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2451                                 video_id = mobj.group(1)
2452                                 if video_id not in already_seen:
2453                                         video_ids.append(video_id)
2454                                         already_seen.add(video_id)
2455                                         if len(video_ids) == n:
2456                                                 # Specified n videos reached
2457                                                 for id in video_ids:
2458                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2459                                                 return
2460
2461                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2462                                 for id in video_ids:
2463                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2464                                 return
2465
2466                         pagenum = pagenum + 1
2467
2468
2469 class YoutubePlaylistIE(InfoExtractor):
2470         """Information Extractor for YouTube playlists."""
2471
2472         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2473         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2474         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2475         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2476         _youtube_ie = None
2477         IE_NAME = u'youtube:playlist'
2478
2479         def __init__(self, youtube_ie, downloader=None):
2480                 InfoExtractor.__init__(self, downloader)
2481                 self._youtube_ie = youtube_ie
2482
2483         def report_download_page(self, playlist_id, pagenum):
2484                 """Report attempt to download playlist page with given number."""
2485                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2486
2487         def _real_initialize(self):
2488                 self._youtube_ie.initialize()
2489
2490         def _real_extract(self, url):
2491                 # Extract playlist id
2492                 mobj = re.match(self._VALID_URL, url)
2493                 if mobj is None:
2494                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2495                         return
2496
2497                 # Single video case
2498                 if mobj.group(3) is not None:
2499                         self._youtube_ie.extract(mobj.group(3))
2500                         return
2501
2502                 # Download playlist pages
2503                 # prefix is 'p' as default for playlists but there are other types that need extra care
2504                 playlist_prefix = mobj.group(1)
2505                 if playlist_prefix == 'a':
2506                         playlist_access = 'artist'
2507                 else:
2508                         playlist_prefix = 'p'
2509                         playlist_access = 'view_play_list'
2510                 playlist_id = mobj.group(2)
2511                 video_ids = []
2512                 pagenum = 1
2513
2514                 while True:
2515                         self.report_download_page(playlist_id, pagenum)
2516                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2517                         request = urllib2.Request(url)
2518                         try:
2519                                 page = urllib2.urlopen(request).read()
2520                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2522                                 return
2523
2524                         # Extract video identifiers
2525                         ids_in_page = []
2526                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2527                                 if mobj.group(1) not in ids_in_page:
2528                                         ids_in_page.append(mobj.group(1))
2529                         video_ids.extend(ids_in_page)
2530
2531                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2532                                 break
2533                         pagenum = pagenum + 1
2534
2535                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2536                 playlistend = self._downloader.params.get('playlistend', -1)
2537                 video_ids = video_ids[playliststart:playlistend]
2538
2539                 for id in video_ids:
2540                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2541                 return
2542
2543
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Accepts a youtube.com/user/<name> URL or the 'ytuser:<name>' shorthand,
	collects the user's uploaded video ids page-by-page via the GData API,
	applies the playliststart/playlistend window and hands each video to
	the wrapped YoutubeIE instance.
	"""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Number of ids requested per GData query; also used to detect the last page.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	# Delegate extractor that performs the actual per-video download.
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within the page,
			# original order preserved).
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# Apply the user-requested window; playlistend == -1 means
		# 'up to the end of the list' and must not be used as a slice
		# endpoint (that would drop the last id).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2630
2631
2632 class DepositFilesIE(InfoExtractor):
2633         """Information extractor for depositfiles.com"""
2634
2635         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2636         IE_NAME = u'DepositFiles'
2637
2638         def __init__(self, downloader=None):
2639                 InfoExtractor.__init__(self, downloader)
2640
2641         def report_download_webpage(self, file_id):
2642                 """Report webpage download."""
2643                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2644
2645         def report_extraction(self, file_id):
2646                 """Report information extraction."""
2647                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2648
2649         def _real_extract(self, url):
2650                 # At this point we have a new file
2651                 self._downloader.increment_downloads()
2652
2653                 file_id = url.split('/')[-1]
2654                 # Rebuild url in english locale
2655                 url = 'http://depositfiles.com/en/files/' + file_id
2656
2657                 # Retrieve file webpage with 'Free download' button pressed
2658                 free_download_indication = { 'gateway_result' : '1' }
2659                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2660                 try:
2661                         self.report_download_webpage(file_id)
2662                         webpage = urllib2.urlopen(request).read()
2663                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2664                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2665                         return
2666
2667                 # Search for the real file URL
2668                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2669                 if (mobj is None) or (mobj.group(1) is None):
2670                         # Try to figure out reason of the error.
2671                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2672                         if (mobj is not None) and (mobj.group(1) is not None):
2673                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2674                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2675                         else:
2676                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2677                         return
2678
2679                 file_url = mobj.group(1)
2680                 file_extension = os.path.splitext(file_url)[1][1:]
2681
2682                 # Search for file title
2683                 mobj = re.search(r'<b title="(.*?)">', webpage)
2684                 if mobj is None:
2685                         self._downloader.trouble(u'ERROR: unable to extract title')
2686                         return
2687                 file_title = mobj.group(1).decode('utf-8')
2688
2689                 try:
2690                         # Process file information
2691                         self._downloader.process_info({
2692                                 'id':           file_id.decode('utf-8'),
2693                                 'url':          file_url.decode('utf-8'),
2694                                 'uploader':     u'NA',
2695                                 'upload_date':  u'NA',
2696                                 'title':        file_title,
2697                                 'stitle':       file_title,
2698                                 'ext':          file_extension.decode('utf-8'),
2699                                 'format':       u'NA',
2700                                 'player_url':   None,
2701                         })
2702                 except UnavailableVideoError, err:
2703                         self._downloader.trouble(u'ERROR: unable to download file')
2704
2705
2706 class FacebookIE(InfoExtractor):
2707         """Information Extractor for Facebook"""
2708
2709         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2710         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2711         _NETRC_MACHINE = 'facebook'
2712         _available_formats = ['video', 'highqual', 'lowqual']
2713         _video_extensions = {
2714                 'video': 'mp4',
2715                 'highqual': 'mp4',
2716                 'lowqual': 'mp4',
2717         }
2718         IE_NAME = u'facebook'
2719
2720         def __init__(self, downloader=None):
2721                 InfoExtractor.__init__(self, downloader)
2722
2723         def _reporter(self, message):
2724                 """Add header and report message."""
2725                 self._downloader.to_screen(u'[facebook] %s' % message)
2726
2727         def report_login(self):
2728                 """Report attempt to log in."""
2729                 self._reporter(u'Logging in')
2730
2731         def report_video_webpage_download(self, video_id):
2732                 """Report attempt to download video webpage."""
2733                 self._reporter(u'%s: Downloading video webpage' % video_id)
2734
2735         def report_information_extraction(self, video_id):
2736                 """Report attempt to extract video information."""
2737                 self._reporter(u'%s: Extracting video information' % video_id)
2738
2739         def _parse_page(self, video_webpage):
2740                 """Extract video information from page"""
2741                 # General data
2742                 data = {'title': r'\("video_title", "(.*?)"\)',
2743                         'description': r'<div class="datawrap">(.*?)</div>',
2744                         'owner': r'\("video_owner_name", "(.*?)"\)',
2745                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2746                         }
2747                 video_info = {}
2748                 for piece in data.keys():
2749                         mobj = re.search(data[piece], video_webpage)
2750                         if mobj is not None:
2751                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2752
2753                 # Video urls
2754                 video_urls = {}
2755                 for fmt in self._available_formats:
2756                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2757                         if mobj is not None:
2758                                 # URL is in a Javascript segment inside an escaped Unicode format within
2759                                 # the generally utf-8 page
2760                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2761                 video_info['video_urls'] = video_urls
2762
2763                 return video_info
2764
2765         def _real_initialize(self):
2766                 if self._downloader is None:
2767                         return
2768
2769                 useremail = None
2770                 password = None
2771                 downloader_params = self._downloader.params
2772
2773                 # Attempt to use provided username and password or .netrc data
2774                 if downloader_params.get('username', None) is not None:
2775                         useremail = downloader_params['username']
2776                         password = downloader_params['password']
2777                 elif downloader_params.get('usenetrc', False):
2778                         try:
2779                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2780                                 if info is not None:
2781                                         useremail = info[0]
2782                                         password = info[2]
2783                                 else:
2784                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2785                         except (IOError, netrc.NetrcParseError), err:
2786                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2787                                 return
2788
2789                 if useremail is None:
2790                         return
2791
2792                 # Log in
2793                 login_form = {
2794                         'email': useremail,
2795                         'pass': password,
2796                         'login': 'Log+In'
2797                         }
2798                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2799                 try:
2800                         self.report_login()
2801                         login_results = urllib2.urlopen(request).read()
2802                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2803                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2804                                 return
2805                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2806                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2807                         return
2808
2809         def _real_extract(self, url):
2810                 mobj = re.match(self._VALID_URL, url)
2811                 if mobj is None:
2812                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2813                         return
2814                 video_id = mobj.group('ID')
2815
2816                 # Get video webpage
2817                 self.report_video_webpage_download(video_id)
2818                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2819                 try:
2820                         page = urllib2.urlopen(request)
2821                         video_webpage = page.read()
2822                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2823                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2824                         return
2825
2826                 # Start extracting information
2827                 self.report_information_extraction(video_id)
2828
2829                 # Extract information
2830                 video_info = self._parse_page(video_webpage)
2831
2832                 # uploader
2833                 if 'owner' not in video_info:
2834                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2835                         return
2836                 video_uploader = video_info['owner']
2837
2838                 # title
2839                 if 'title' not in video_info:
2840                         self._downloader.trouble(u'ERROR: unable to extract video title')
2841                         return
2842                 video_title = video_info['title']
2843                 video_title = video_title.decode('utf-8')
2844                 video_title = sanitize_title(video_title)
2845
2846                 simple_title = _simplify_title(video_title)
2847
2848                 # thumbnail image
2849                 if 'thumbnail' not in video_info:
2850                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2851                         video_thumbnail = ''
2852                 else:
2853                         video_thumbnail = video_info['thumbnail']
2854
2855                 # upload date
2856                 upload_date = u'NA'
2857                 if 'upload_date' in video_info:
2858                         upload_time = video_info['upload_date']
2859                         timetuple = email.utils.parsedate_tz(upload_time)
2860                         if timetuple is not None:
2861                                 try:
2862                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2863                                 except:
2864                                         pass
2865
2866                 # description
2867                 video_description = video_info.get('description', 'No description available.')
2868
2869                 url_map = video_info['video_urls']
2870                 if len(url_map.keys()) > 0:
2871                         # Decide which formats to download
2872                         req_format = self._downloader.params.get('format', None)
2873                         format_limit = self._downloader.params.get('format_limit', None)
2874
2875                         if format_limit is not None and format_limit in self._available_formats:
2876                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2877                         else:
2878                                 format_list = self._available_formats
2879                         existing_formats = [x for x in format_list if x in url_map]
2880                         if len(existing_formats) == 0:
2881                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2882                                 return
2883                         if req_format is None:
2884                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2885                         elif req_format == 'worst':
2886                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2887                         elif req_format == '-1':
2888                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2889                         else:
2890                                 # Specific format
2891                                 if req_format not in url_map:
2892                                         self._downloader.trouble(u'ERROR: requested format not available')
2893                                         return
2894                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2895
2896                 for format_param, video_real_url in video_url_list:
2897
2898                         # At this point we have a new video
2899                         self._downloader.increment_downloads()
2900
2901                         # Extension
2902                         video_extension = self._video_extensions.get(format_param, 'mp4')
2903
2904                         try:
2905                                 # Process video information
2906                                 self._downloader.process_info({
2907                                         'id':           video_id.decode('utf-8'),
2908                                         'url':          video_real_url.decode('utf-8'),
2909                                         'uploader':     video_uploader.decode('utf-8'),
2910                                         'upload_date':  upload_date,
2911                                         'title':        video_title,
2912                                         'stitle':       simple_title,
2913                                         'ext':          video_extension.decode('utf-8'),
2914                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2915                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2916                                         'description':  video_description.decode('utf-8'),
2917                                         'player_url':   None,
2918                                 })
2919                         except UnavailableVideoError, err:
2920                                 self._downloader.trouble(u'\nERROR: unable to download video')
2921
2922 class BlipTVIE(InfoExtractor):
2923         """Information extractor for blip.tv"""
2924
2925         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2926         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2927         IE_NAME = u'blip.tv'
2928
2929         def report_extraction(self, file_id):
2930                 """Report information extraction."""
2931                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2932
2933         def report_direct_download(self, title):
2934                 """Report information extraction."""
2935                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2936
2937         def _real_extract(self, url):
2938                 mobj = re.match(self._VALID_URL, url)
2939                 if mobj is None:
2940                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2941                         return
2942
2943                 if '?' in url:
2944                         cchar = '&'
2945                 else:
2946                         cchar = '?'
2947                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2948                 request = urllib2.Request(json_url)
2949                 self.report_extraction(mobj.group(1))
2950                 info = None
2951                 try:
2952                         urlh = urllib2.urlopen(request)
2953                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2954                                 basename = url.split('/')[-1]
2955                                 title,ext = os.path.splitext(basename)
2956                                 title = title.decode('UTF-8')
2957                                 ext = ext.replace('.', '')
2958                                 self.report_direct_download(title)
2959                                 info = {
2960                                         'id': title,
2961                                         'url': url,
2962                                         'title': title,
2963                                         'stitle': _simplify_title(title),
2964                                         'ext': ext,
2965                                         'urlhandle': urlh
2966                                 }
2967                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2968                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2969                         return
2970                 if info is None: # Regular URL
2971                         try:
2972                                 json_code = urlh.read()
2973                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2974                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2975                                 return
2976
2977                         try:
2978                                 json_data = json.loads(json_code)
2979                                 if 'Post' in json_data:
2980                                         data = json_data['Post']
2981                                 else:
2982                                         data = json_data
2983         
2984                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2985                                 video_url = data['media']['url']
2986                                 umobj = re.match(self._URL_EXT, video_url)
2987                                 if umobj is None:
2988                                         raise ValueError('Can not determine filename extension')
2989                                 ext = umobj.group(1)
2990         
2991                                 info = {
2992                                         'id': data['item_id'],
2993                                         'url': video_url,
2994                                         'uploader': data['display_name'],
2995                                         'upload_date': upload_date,
2996                                         'title': data['title'],
2997                                         'stitle': _simplify_title(data['title']),
2998                                         'ext': ext,
2999                                         'format': data['media']['mimeType'],
3000                                         'thumbnail': data['thumbnailUrl'],
3001                                         'description': data['description'],
3002                                         'player_url': data['embedUrl']
3003                                 }
3004                         except (ValueError,KeyError), err:
3005                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3006                                 return
3007
3008                 self._downloader.increment_downloads()
3009
3010                 try:
3011                         self._downloader.process_info(info)
3012                 except UnavailableVideoError, err:
3013                         self._downloader.trouble(u'\nERROR: unable to download video')
3014
3015
3016 class MyVideoIE(InfoExtractor):
3017         """Information Extractor for myvideo.de."""
3018
3019         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3020         IE_NAME = u'myvideo'
3021
3022         def __init__(self, downloader=None):
3023                 InfoExtractor.__init__(self, downloader)
3024         
3025         def report_download_webpage(self, video_id):
3026                 """Report webpage download."""
3027                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3028
3029         def report_extraction(self, video_id):
3030                 """Report information extraction."""
3031                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3032
3033         def _real_extract(self,url):
3034                 mobj = re.match(self._VALID_URL, url)
3035                 if mobj is None:
3036                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3037                         return
3038
3039                 video_id = mobj.group(1)
3040
3041                 # Get video webpage
3042                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3043                 try:
3044                         self.report_download_webpage(video_id)
3045                         webpage = urllib2.urlopen(request).read()
3046                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3047                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3048                         return
3049
3050                 self.report_extraction(video_id)
3051                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3052                                  webpage)
3053                 if mobj is None:
3054                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3055                         return
3056                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3057
3058                 mobj = re.search('<title>([^<]+)</title>', webpage)
3059                 if mobj is None:
3060                         self._downloader.trouble(u'ERROR: unable to extract title')
3061                         return
3062
3063                 video_title = mobj.group(1)
3064                 video_title = sanitize_title(video_title)
3065
3066                 simple_title = _simplify_title(video_title)
3067
3068                 try:
3069                         self._downloader.process_info({
3070                                 'id':           video_id,
3071                                 'url':          video_url,
3072                                 'uploader':     u'NA',
3073                                 'upload_date':  u'NA',
3074                                 'title':        video_title,
3075                                 'stitle':       simple_title,
3076                                 'ext':          u'flv',
3077                                 'format':       u'NA',
3078                                 'player_url':   None,
3079                         })
3080                 except UnavailableVideoError:
3081                         self._downloader.trouble(u'\nERROR: Unable to download video')
3082
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Matches either a shortname alias (":tds", ":colbert", ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's MRSS index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve a show/episode URL and download every media item it lists."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# A shortname alias is rewritten to the show's full-episodes page,
		# which redirects to the newest episode.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode component means "download the newest episode": we
		# follow the server redirect below to find the concrete episode.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# geturl() reflects the redirect target; it must now name a
			# specific episode or we cannot proceed.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The embedded Flash <param> carries both the player URL and the
		# mtvn URI used to query the MRSS index feed.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several media items; download each.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:<showId>.com:...:<shortMediaId>"
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for all available renditions.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3217
3218
3219 class EscapistIE(InfoExtractor):
3220         """Information extractor for The Escapist """
3221
3222         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3223         IE_NAME = u'escapist'
3224
3225         def report_extraction(self, showName):
3226                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3227
3228         def report_config_download(self, showName):
3229                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3230
3231         def _real_extract(self, url):
3232                 htmlParser = HTMLParser.HTMLParser()
3233
3234                 mobj = re.match(self._VALID_URL, url)
3235                 if mobj is None:
3236                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3237                         return
3238                 showName = mobj.group('showname')
3239                 videoId = mobj.group('episode')
3240
3241                 self.report_extraction(showName)
3242                 try:
3243                         webPage = urllib2.urlopen(url).read()
3244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3245                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3246                         return
3247
3248                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3249                 description = htmlParser.unescape(descMatch.group(1))
3250                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3251                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3252                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3253                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3254                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3255                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3256
3257                 self.report_config_download(showName)
3258                 try:
3259                         configJSON = urllib2.urlopen(configUrl).read()
3260                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3261                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3262                         return
3263
3264                 # Technically, it's JavaScript, not JSON
3265                 configJSON = configJSON.replace("'", '"')
3266
3267                 try:
3268                         config = json.loads(configJSON)
3269                 except (ValueError,), err:
3270                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3271                         return
3272
3273                 playlist = config['playlist']
3274                 videoUrl = playlist[1]['url']
3275
3276                 self._downloader.increment_downloads()
3277                 info = {
3278                         'id': videoId,
3279                         'url': videoUrl,
3280                         'uploader': showName,
3281                         'upload_date': None,
3282                         'title': showName,
3283                         'stitle': _simplify_title(showName),
3284                         'ext': 'flv',
3285                         'format': 'flv',
3286                         'thumbnail': imgUrl,
3287                         'description': description,
3288                         'player_url': playerUrl,
3289                 }
3290
3291                 try:
3292                         self._downloader.process_info(info)
3293                 except UnavailableVideoError, err:
3294                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3295
3296
3297 class CollegeHumorIE(InfoExtractor):
3298         """Information extractor for collegehumor.com"""
3299
3300         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3301         IE_NAME = u'collegehumor'
3302
3303         def report_webpage(self, video_id):
3304                 """Report information extraction."""
3305                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3306
3307         def report_extraction(self, video_id):
3308                 """Report information extraction."""
3309                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3310
3311         def _real_extract(self, url):
3312                 htmlParser = HTMLParser.HTMLParser()
3313
3314                 mobj = re.match(self._VALID_URL, url)
3315                 if mobj is None:
3316                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3317                         return
3318                 video_id = mobj.group('videoid')
3319
3320                 self.report_webpage(video_id)
3321                 request = urllib2.Request(url)
3322                 try:
3323                         webpage = urllib2.urlopen(request).read()
3324                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3325                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3326                         return
3327
3328                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3329                 if m is None:
3330                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3331                         return
3332                 internal_video_id = m.group('internalvideoid')
3333
3334                 info = {
3335                         'id': video_id,
3336                         'internal_id': internal_video_id,
3337                 }
3338
3339                 self.report_extraction(video_id)
3340                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3341                 try:
3342                         metaXml = urllib2.urlopen(xmlUrl).read()
3343                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3344                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3345                         return
3346
3347                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3348                 try:
3349                         videoNode = mdoc.findall('./video')[0]
3350                         info['description'] = videoNode.findall('./description')[0].text
3351                         info['title'] = videoNode.findall('./caption')[0].text
3352                         info['stitle'] = _simplify_title(info['title'])
3353                         info['url'] = videoNode.findall('./file')[0].text
3354                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3355                         info['ext'] = info['url'].rpartition('.')[2]
3356                         info['format'] = info['ext']
3357                 except IndexError:
3358                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3359                         return
3360
3361                 self._downloader.increment_downloads()
3362
3363                 try:
3364                         self._downloader.process_info(info)
3365                 except UnavailableVideoError, err:
3366                         self._downloader.trouble(u'\nERROR: unable to download video')
3367
3368
3369 class XVideosIE(InfoExtractor):
3370         """Information extractor for xvideos.com"""
3371
3372         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3373         IE_NAME = u'xvideos'
3374
3375         def report_webpage(self, video_id):
3376                 """Report information extraction."""
3377                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3378
3379         def report_extraction(self, video_id):
3380                 """Report information extraction."""
3381                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3382
3383         def _real_extract(self, url):
3384                 htmlParser = HTMLParser.HTMLParser()
3385
3386                 mobj = re.match(self._VALID_URL, url)
3387                 if mobj is None:
3388                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3389                         return
3390                 video_id = mobj.group(1).decode('utf-8')
3391
3392                 self.report_webpage(video_id)
3393
3394                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3395                 try:
3396                         webpage = urllib2.urlopen(request).read()
3397                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3398                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3399                         return
3400
3401                 self.report_extraction(video_id)
3402
3403
3404                 # Extract video URL
3405                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3406                 if mobj is None:
3407                         self._downloader.trouble(u'ERROR: unable to extract video url')
3408                         return
3409                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3410
3411
3412                 # Extract title
3413                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3414                 if mobj is None:
3415                         self._downloader.trouble(u'ERROR: unable to extract video title')
3416                         return
3417                 video_title = mobj.group(1).decode('utf-8')
3418
3419
3420                 # Extract video thumbnail
3421                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3422                 if mobj is None:
3423                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3424                         return
3425                 video_thumbnail = mobj.group(1).decode('utf-8')
3426
3427
3428
3429                 self._downloader.increment_downloads()
3430                 info = {
3431                         'id': video_id,
3432                         'url': video_url,
3433                         'uploader': None,
3434                         'upload_date': None,
3435                         'title': video_title,
3436                         'stitle': _simplify_title(video_title),
3437                         'ext': 'flv',
3438                         'format': 'flv',
3439                         'thumbnail': video_thumbnail,
3440                         'description': None,
3441                         'player_url': None,
3442                 }
3443
3444                 try:
3445                         self._downloader.process_info(info)
3446                 except UnavailableVideoError, err:
3447                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3448
3449
3450 class SoundcloudIE(InfoExtractor):
3451         """Information extractor for soundcloud.com
3452            To access the media, the uid of the song and a stream token
3453            must be extracted from the page source and the script must make
3454            a request to media.soundcloud.com/crossdomain.xml. Then
3455            the media can be grabbed by requesting from an url composed
3456            of the stream token and uid
3457          """
3458
3459         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3460         IE_NAME = u'soundcloud'
3461
3462         def __init__(self, downloader=None):
3463                 InfoExtractor.__init__(self, downloader)
3464
3465         def report_webpage(self, video_id):
3466                 """Report information extraction."""
3467                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3468
3469         def report_extraction(self, video_id):
3470                 """Report information extraction."""
3471                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3472
3473         def _real_extract(self, url):
3474                 htmlParser = HTMLParser.HTMLParser()
3475
3476                 mobj = re.match(self._VALID_URL, url)
3477                 if mobj is None:
3478                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3479                         return
3480
3481                 # extract uploader (which is in the url)
3482                 uploader = mobj.group(1).decode('utf-8')
3483                 # extract simple title (uploader + slug of song title)
3484                 slug_title =  mobj.group(2).decode('utf-8')
3485                 simple_title = uploader + '-' + slug_title
3486
3487                 self.report_webpage('%s/%s' % (uploader, slug_title))
3488
3489                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3490                 try:
3491                         webpage = urllib2.urlopen(request).read()
3492                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3493                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3494                         return
3495
3496                 self.report_extraction('%s/%s' % (uploader, slug_title))
3497
3498                 # extract uid and stream token that soundcloud hands out for access
3499                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3500                 if mobj:
3501                         video_id = mobj.group(1)
3502                         stream_token = mobj.group(2)
3503
3504                 # extract unsimplified title
3505                 mobj = re.search('"title":"(.*?)",', webpage)
3506                 if mobj:
3507                         title = mobj.group(1)
3508
3509                 # construct media url (with uid/token)
3510                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3511                 mediaURL = mediaURL % (video_id, stream_token)
3512
3513                 # description
3514                 description = u'No description available'
3515                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3516                 if mobj:
3517                         description = mobj.group(1)
3518                 
3519                 # upload date
3520                 upload_date = None
3521                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3522                 if mobj:
3523                         try:
3524                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3525                         except Exception, e:
3526                                 print str(e)
3527
3528                 # for soundcloud, a request to a cross domain is required for cookies
3529                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3530
3531                 try:
3532                         self._downloader.process_info({
3533                                 'id':           video_id.decode('utf-8'),
3534                                 'url':          mediaURL,
3535                                 'uploader':     uploader.decode('utf-8'),
3536                                 'upload_date':  upload_date,
3537                                 'title':        simple_title.decode('utf-8'),
3538                                 'stitle':       simple_title.decode('utf-8'),
3539                                 'ext':          u'mp3',
3540                                 'format':       u'NA',
3541                                 'player_url':   None,
3542                                 'description': description.decode('utf-8')
3543                         })
3544                 except UnavailableVideoError:
3545                         self._downloader.trouble(u'\nERROR: unable to download video')
3546
3547
3548 class InfoQIE(InfoExtractor):
3549         """Information extractor for infoq.com"""
3550
3551         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3552         IE_NAME = u'infoq'
3553
3554         def report_webpage(self, video_id):
3555                 """Report information extraction."""
3556                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3557
3558         def report_extraction(self, video_id):
3559                 """Report information extraction."""
3560                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3561
3562         def _real_extract(self, url):
3563                 htmlParser = HTMLParser.HTMLParser()
3564
3565                 mobj = re.match(self._VALID_URL, url)
3566                 if mobj is None:
3567                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3568                         return
3569
3570                 self.report_webpage(url)
3571
3572                 request = urllib2.Request(url)
3573                 try:
3574                         webpage = urllib2.urlopen(request).read()
3575                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3576                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3577                         return
3578
3579                 self.report_extraction(url)
3580
3581
3582                 # Extract video URL
3583                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3584                 if mobj is None:
3585                         self._downloader.trouble(u'ERROR: unable to extract video url')
3586                         return
3587                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3588
3589
3590                 # Extract title
3591                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3592                 if mobj is None:
3593                         self._downloader.trouble(u'ERROR: unable to extract video title')
3594                         return
3595                 video_title = mobj.group(1).decode('utf-8')
3596
3597                 # Extract description
3598                 video_description = u'No description available.'
3599                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3600                 if mobj is not None:
3601                         video_description = mobj.group(1).decode('utf-8')
3602
3603                 video_filename = video_url.split('/')[-1]
3604                 video_id, extension = video_filename.split('.')
3605
3606                 self._downloader.increment_downloads()
3607                 info = {
3608                         'id': video_id,
3609                         'url': video_url,
3610                         'uploader': None,
3611                         'upload_date': None,
3612                         'title': video_title,
3613                         'stitle': _simplify_title(video_title),
3614                         'ext': extension,
3615                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3616                         'thumbnail': None,
3617                         'description': video_description,
3618                         'player_url': None,
3619                 }
3620
3621                 try:
3622                         self._downloader.process_info(info)
3623                 except UnavailableVideoError, err:
3624                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3625
3626 class MixcloudIE(InfoExtractor):
3627         """Information extractor for www.mixcloud.com"""
3628         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3629         IE_NAME = u'mixcloud'
3630
3631         def __init__(self, downloader=None):
3632                 InfoExtractor.__init__(self, downloader)
3633
3634         def report_download_json(self, file_id):
3635                 """Report JSON download."""
3636                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3637
3638         def report_extraction(self, file_id):
3639                 """Report information extraction."""
3640                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3641
3642         def get_urls(self, jsonData, fmt, bitrate='best'):
3643                 """Get urls from 'audio_formats' section in json"""
3644                 file_url = None
3645                 try:
3646                         bitrate_list = jsonData[fmt]
3647                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3648                                 bitrate = max(bitrate_list) # select highest
3649
3650                         url_list = jsonData[fmt][bitrate]
3651                 except TypeError: # we have no bitrate info.
3652                         url_list = jsonData[fmt]
3653                                 
3654                 return url_list
3655
3656         def check_urls(self, url_list):
3657                 """Returns 1st active url from list"""
3658                 for url in url_list:
3659                         try:
3660                                 urllib2.urlopen(url)
3661                                 return url
3662                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3663                                 url = None
3664
3665                 return None
3666
3667         def _print_formats(self, formats):
3668                 print 'Available formats:'
3669                 for fmt in formats.keys():
3670                         for b in formats[fmt]:
3671                                 try:
3672                                         ext = formats[fmt][b][0]
3673                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3674                                 except TypeError: # we have no bitrate info
3675                                         ext = formats[fmt][0]
3676                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3677                                         break
3678
3679         def _real_extract(self, url):
3680                 mobj = re.match(self._VALID_URL, url)
3681                 if mobj is None:
3682                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3683                         return
3684                 # extract uploader & filename from url
3685                 uploader = mobj.group(1).decode('utf-8')
3686                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3687
3688                 # construct API request
3689                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3690                 # retrieve .json file with links to files
3691                 request = urllib2.Request(file_url)
3692                 try:
3693                         self.report_download_json(file_url)
3694                         jsonData = urllib2.urlopen(request).read()
3695                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3696                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3697                         return
3698
3699                 # parse JSON
3700                 json_data = json.loads(jsonData)
3701                 player_url = json_data['player_swf_url']
3702                 formats = dict(json_data['audio_formats'])
3703
3704                 req_format = self._downloader.params.get('format', None)
3705                 bitrate = None
3706
3707                 if self._downloader.params.get('listformats', None):
3708                         self._print_formats(formats)
3709                         return
3710
3711                 if req_format is None or req_format == 'best':
3712                         for format_param in formats.keys():
3713                                 url_list = self.get_urls(formats, format_param)
3714                                 # check urls
3715                                 file_url = self.check_urls(url_list)
3716                                 if file_url is not None:
3717                                         break # got it!
3718                 else:
3719                         if req_format not in formats.keys():
3720                                 self._downloader.trouble(u'ERROR: format is not available')
3721                                 return
3722
3723                         url_list = self.get_urls(formats, req_format)
3724                         file_url = self.check_urls(url_list)
3725                         format_param = req_format
3726
3727                 # We have audio
3728                 self._downloader.increment_downloads()
3729                 try:
3730                         # Process file information
3731                         self._downloader.process_info({
3732                                 'id':           file_id.decode('utf-8'),
3733                                 'url':          file_url.decode('utf-8'),
3734                                 'uploader':     uploader.decode('utf-8'),
3735                                 'upload_date':  u'NA',
3736                                 'title':        json_data['name'],
3737                                 'stitle':       _simplify_title(json_data['name']),
3738                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3739                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3740                                 'thumbnail':    json_data['thumbnail_url'],
3741                                 'description':  json_data['description'],
3742                                 'player_url':   player_url.decode('utf-8'),
3743                         })
3744                 except UnavailableVideoError, err:
3745                         self._downloader.trouble(u'ERROR: unable to download file')
3746
3747
3748
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are attached to a downloader via its add_post_processor()
	method. After each successful download the downloader walks its chain
	of PostProcessors, feeding the first one the download's info dict and
	each subsequent one the value returned by its predecessor. A None
	return value terminates the chain early.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary augmented with
		a "filepath" key naming the downloaded file. Returning None stops
		the postprocessing chain; returning a (possibly modified) info
		dictionary passes it along to the next PostProcessor. A
		PostProcessingError may be raised to signal failure to the
		calling downloader.
		"""
		# Default behaviour: pass the information through untouched.
		return information
3794
3795
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	copy the stream losslessly (when it already matches the preferred
	codec) or transcode it to the preferred codec/bitrate.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec      # 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality  # ffmpeg '-ab' bitrate spec, or None
		self._keepvideo = keepvideo                # keep the source video file?

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None on failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				devnull.close() # was leaked before; close it explicitly
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints one stream block per stream; remember the last
		# codec_name seen and report it once we hit the audio stream marker.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract audio from path into out_path; True on success."""
		try:
			# NOTE(review): some ffmpeg builds reject '--' as an unknown
			# option -- confirm the end-of-options marker is accepted.
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close() # was leaked before; close it explicitly
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract the audio track of information['filepath'].

		Returns the updated info dict on success, or None (stopping the
		postprocessing chain) on any failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception: # best effort; never abort the chain over utime
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3895
3896
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	try:
		# Fixed: urlh used to be opened inside an outer try whose finally
		# closed it -- if urlopen itself failed, urlh was unbound and the
		# finally clause raised NameError, masking the real error.
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	# Nothing to do when we already run the published version.
	vmatch = re.search("__version__ = '([^']+)'", newcontent)
	if vmatch is not None and vmatch.group(1) == __version__:
		downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
		return

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3929
def parseOpts():
	"""Parse options from config files and the command line.

	Reads extra arguments from /etc/youtube-dl.conf and the per-user
	config file before sys.argv. Returns (parser, opts, args).
	"""
	# Deferred imports (only needed when parsing options)
	import optparse
	import shlex

	def _readOptions(filename):
		"""Return a list of arguments read from filename ([] if absent)."""
		try:
			optionf = open(filename)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		"""Best-effort terminal width detection; None when unknown."""
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except Exception: # stty missing or output unparsable
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Config files are read first so the command line can override them.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4137
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# Shared instances: the playlist/user/search extractors delegate to
	# a single YoutubeIE, and the search variants to their base IE.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
	]
	for ie_class in (DepositFilesIE, FacebookIE, BlipTVIE, VimeoIE,
			MyVideoIE, ComedyCentralIE, EscapistIE, CollegeHumorIE,
			XVideosIE, SoundcloudIE, InfoQIE, MixcloudIE):
		extractors.append(ie_class())

	# The generic extractor must come last: it matches almost any URL.
	extractors.append(GenericIE())
	return extractors
4172
4173 def _real_main():
4174         parser, opts, args = parseOpts()
4175
4176         # Open appropriate CookieJar
4177         if opts.cookiefile is None:
4178                 jar = cookielib.CookieJar()
4179         else:
4180                 try:
4181                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4182                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4183                                 jar.load()
4184                 except (IOError, OSError), err:
4185                         sys.exit(u'ERROR: unable to open cookie file')
4186
4187         # Dump user agent
4188         if opts.dump_user_agent:
4189                 print std_headers['User-Agent']
4190                 sys.exit(0)
4191
4192         # Batch file verification
4193         batchurls = []
4194         if opts.batchfile is not None:
4195                 try:
4196                         if opts.batchfile == '-':
4197                                 batchfd = sys.stdin
4198                         else:
4199                                 batchfd = open(opts.batchfile, 'r')
4200                         batchurls = batchfd.readlines()
4201                         batchurls = [x.strip() for x in batchurls]
4202                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4203                 except IOError:
4204                         sys.exit(u'ERROR: batch file could not be read')
4205         all_urls = batchurls + args
4206
4207         # General configuration
4208         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4209         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4210         urllib2.install_opener(opener)
4211         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4212
4213         extractors = gen_extractors()
4214
4215         if opts.list_extractors:
4216                 for ie in extractors:
4217                         print(ie.IE_NAME)
4218                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4219                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4220                         for mu in matchedUrls:
4221                                 print(u'  ' + mu)
4222                 sys.exit(0)
4223
4224         # Conflicting, missing and erroneous options
4225         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4226                 parser.error(u'using .netrc conflicts with giving username/password')
4227         if opts.password is not None and opts.username is None:
4228                 parser.error(u'account username missing')
4229         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4230                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4231         if opts.usetitle and opts.useliteral:
4232                 parser.error(u'using title conflicts with using literal title')
4233         if opts.username is not None and opts.password is None:
4234                 opts.password = getpass.getpass(u'Type account password and press return:')
4235         if opts.ratelimit is not None:
4236                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4237                 if numeric_limit is None:
4238                         parser.error(u'invalid rate limit specified')
4239                 opts.ratelimit = numeric_limit
4240         if opts.retries is not None:
4241                 try:
4242                         opts.retries = long(opts.retries)
4243                 except (TypeError, ValueError), err:
4244                         parser.error(u'invalid retry count specified')
4245         try:
4246                 opts.playliststart = int(opts.playliststart)
4247                 if opts.playliststart <= 0:
4248                         raise ValueError(u'Playlist start must be positive')
4249         except (TypeError, ValueError), err:
4250                 parser.error(u'invalid playlist start number specified')
4251         try:
4252                 opts.playlistend = int(opts.playlistend)
4253                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4254                         raise ValueError(u'Playlist end must be greater than playlist start')
4255         except (TypeError, ValueError), err:
4256                 parser.error(u'invalid playlist end number specified')
4257         if opts.extractaudio:
4258                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4259                         parser.error(u'invalid audio format specified')
4260
4261         # File downloader
4262         fd = FileDownloader({
4263                 'usenetrc': opts.usenetrc,
4264                 'username': opts.username,
4265                 'password': opts.password,
4266                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4267                 'forceurl': opts.geturl,
4268                 'forcetitle': opts.gettitle,
4269                 'forcethumbnail': opts.getthumbnail,
4270                 'forcedescription': opts.getdescription,
4271                 'forcefilename': opts.getfilename,
4272                 'forceformat': opts.getformat,
4273                 'simulate': opts.simulate,
4274                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4275                 'format': opts.format,
4276                 'format_limit': opts.format_limit,
4277                 'listformats': opts.listformats,
4278                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4279                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4280                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4281                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4282                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4283                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4284                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4285                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4286                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4287                         or u'%(id)s.%(ext)s'),
4288                 'ignoreerrors': opts.ignoreerrors,
4289                 'ratelimit': opts.ratelimit,
4290                 'nooverwrites': opts.nooverwrites,
4291                 'retries': opts.retries,
4292                 'continuedl': opts.continue_dl,
4293                 'noprogress': opts.noprogress,
4294                 'playliststart': opts.playliststart,
4295                 'playlistend': opts.playlistend,
4296                 'logtostderr': opts.outtmpl == '-',
4297                 'consoletitle': opts.consoletitle,
4298                 'nopart': opts.nopart,
4299                 'updatetime': opts.updatetime,
4300                 'writedescription': opts.writedescription,
4301                 'writeinfojson': opts.writeinfojson,
4302                 'matchtitle': opts.matchtitle,
4303                 'rejecttitle': opts.rejecttitle,
4304                 'max_downloads': opts.max_downloads,
4305                 })
4306         for extractor in extractors:
4307                 fd.add_info_extractor(extractor)
4308
4309         # PostProcessors
4310         if opts.extractaudio:
4311                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4312
4313         # Update version
4314         if opts.update_self:
4315                 updateSelf(fd, sys.argv[0])
4316
4317         # Maybe do nothing
4318         if len(all_urls) < 1:
4319                 if not opts.update_self:
4320                         parser.error(u'you must provide at least one URL')
4321                 else:
4322                         sys.exit()
4323         retcode = fd.download(all_urls)
4324
4325         # Dump cookie jar if requested
4326         if opts.cookiefile is not None:
4327                 try:
4328                         jar.save()
4329                 except (IOError, OSError), err:
4330                         sys.exit(u'ERROR: unable to save cookie jar')
4331
4332         sys.exit(retcode)
4333
def main():
	"""Entry point: run the downloader and map known failures to exit codes."""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		# Individual download errors were already reported; just signal failure.
		sys.exit(1)
4343
# Run the command-line interface only when executed as a script, not when
# this file is imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: