3fed554df0d3d7878bd728a2b85172cea2ef3356
[youtube-dl.git] / youtube_dl / __init__.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Tuple of contributor names.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        'shizeeg',
        )

__license__ = 'Public Domain'
__version__ = '2011.11.23'

# NOTE(review): presumably fetched by the self-update mechanism — confirm against
# the code that consumes it (not visible in this chunk).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'

25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers for every outgoing request; YoutubeDLHandler.http_request
# (below) forces these onto each request, overriding urllib2's defaults.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        # Minimal stand-in for the stdlib json module: only loads() is provided.
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse a JSON document from a UTF-8 encoded byte string."""
                        s = s.decode('UTF-8')
                        # Helper: abort parsing with position context for debugging.
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        # Advance past whitespace; with expectMore, EOF is an error.
                        def skipSpace(i, expectMore=True):
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        # re.sub callback: turn one backslash escape into its character.
                        def decodeEscape(match):
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # UTF-16 surrogate pair: combine into one astral code point.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        # Parse a double-quoted string starting at i (on the opening quote).
                        def parseString(i):
                                i += 1
                                e = i
                                while True:
                                        e = s.index('"', e)
                                        # Count trailing backslashes: an odd number means the
                                        # quote at e is escaped, keep scanning.
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        # Parse an object literal starting at i (on the '{').
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        # Parse an array literal starting at i (on the '[').
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        # Parse the keyword literals true/false/null.
                        def parseDiscrete(i):
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        # Parse a number; a fraction or exponent makes it a float.
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; default is number.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
195
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original wrapped this in a one-shot generator and called .next()
        # on it, which added nothing; a plain try/except is equivalent.
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec actually exists and can encode text;
                # some platforms report bogus values.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        if sys.platform == 'win32':
                                import msvcrt
                                # Put stdout into binary mode so the video data is not
                                # mangled by CRLF translation on Windows.
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
271
272
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        timetuple = email.utils.parsedate_tz(timestr)
        # parsedate_tz() yields None for strings it cannot parse.
        if timetuple is None:
                return None
        return email.utils.mktime_tz(timetuple)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286         """ Remove all duplicates from the input iterable """
287         res = []
288         for el in iterable:
289                 if el not in res:
290                         res.append(el)
291         return res
292
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        pass
301
302
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass
310
311
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        pass
319
class MaxDownloadsReached(Exception):
        """ --max-downloads limit has been reached. """
        pass
323
324
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass
332
333
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Class-level defaults; both counts are in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Record how much arrived versus what the server promised.
                self.expected = expected
                self.downloaded = downloaded
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send a raw deflate stream (no zlib header); try
                # that first, then fall back to a regular zlib stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Older addinfourl versions take no 'code' constructor argument
                # (and have no getcode()); set the attribute by hand there.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our standard headers, replacing any urllib2 defaults.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Internal marker header: strip it (and Accept-encoding) before
                # the request actually goes out on the wire.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
407
408
409 class FileDownloader(object):
410         """File Downloader class.
411
412         File downloader objects are the ones responsible of downloading the
413         actual video file and writing it to disk if the user has requested
414         it, among some other tasks. In most cases there should be one per
415         program. As, given a video URL, the downloader doesn't know how to
416         extract all the needed information, task that InfoExtractors do, it
417         has to pass the URL to one of them.
418
419         For this, file downloader objects have a method that allows
420         InfoExtractors to be registered in a given order. When it is passed
421         a URL, the file downloader handles it to the first InfoExtractor it
422         finds that reports being able to handle it. The InfoExtractor extracts
423         all the information about the video or videos the URL refers to, and
424         asks the FileDownloader to process the video information, possibly
425         downloading the video.
426
427         File downloaders accept a lot of parameters. In order not to saturate
428         the object constructor with arguments, it receives a dictionary of
429         options instead. These options are available through the params
430         attribute for the InfoExtractors to use. The FileDownloader also
431         registers itself as the downloader in charge for the InfoExtractors
432         that are added to it, so this is a "mutual registration".
433
434         Available options:
435
436         username:         Username for authentication purposes.
437         password:         Password for authentication purposes.
438         usenetrc:         Use netrc for authentication instead.
439         quiet:            Do not print messages to stdout.
440         forceurl:         Force printing final URL.
441         forcetitle:       Force printing title.
442         forcethumbnail:   Force printing thumbnail URL.
443         forcedescription: Force printing description.
444         forcefilename:    Force printing final filename.
445         simulate:         Do not download the video files.
446         format:           Video format code.
447         format_limit:     Highest quality format to try.
448         outtmpl:          Template for output names.
449         ignoreerrors:     Do not stop on download errors.
450         ratelimit:        Download speed limit, in bytes/sec.
451         nooverwrites:     Prevent overwriting files.
452         retries:          Number of times to retry for HTTP error 5xx
453         continuedl:       Try to continue downloads if possible.
454         noprogress:       Do not print the progress bar.
455         playliststart:    Playlist item to start at.
456         playlistend:      Playlist item to end at.
457         matchtitle:       Download only matching titles.
458         rejecttitle:      Reject downloads for matching titles.
459         logtostderr:      Log messages to stderr instead of stdout.
460         consoletitle:     Display progress in console window's titlebar.
461         nopart:           Do not use temporary .part files.
462         updatetime:       Use the Last-modified header to set output file timestamps.
463         writedescription: Write the video description to a .description file
464         writeinfojson:    Write the video description to a .info.json file
465         """
466
        params = None                   # Dictionary of options (see class docstring)
        _ies = []                       # Registered InfoExtractor objects
        _pps = []                       # Registered PostProcessor objects
        _download_retcode = None        # Exit status accumulated across downloads
        _num_downloads = None           # Ordinal of the current download (autonumber)
        _screen_file = None             # Stream that to_screen() writes to
473
474         def __init__(self, params):
475                 """Create a FileDownloader object with the given options."""
476                 self._ies = []
477                 self._pps = []
478                 self._download_retcode = 0
479                 self._num_downloads = 0
480                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
481                 self.params = params
482
483         @staticmethod
484         def format_bytes(bytes):
485                 if bytes is None:
486                         return 'N/A'
487                 if type(bytes) is str:
488                         bytes = float(bytes)
489                 if bytes == 0.0:
490                         exponent = 0
491                 else:
492                         exponent = long(math.log(bytes, 1024.0))
493                 suffix = 'bkMGTPEZY'[exponent]
494                 converted = float(bytes) / float(1024 ** exponent)
495                 return '%.2f%s' % (converted, suffix)
496
497         @staticmethod
498         def calc_percent(byte_counter, data_len):
499                 if data_len is None:
500                         return '---.-%'
501                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
503         @staticmethod
504         def calc_eta(start, now, total, current):
505                 if total is None:
506                         return '--:--'
507                 dif = now - start
508                 if current == 0 or dif < 0.001: # One millisecond
509                         return '--:--'
510                 rate = float(current) / dif
511                 eta = long((float(total) - float(current)) / rate)
512                 (eta_mins, eta_secs) = divmod(eta, 60)
513                 if eta_mins > 99:
514                         return '--:--'
515                 return '%02d:%02d' % (eta_mins, eta_secs)
516
517         @staticmethod
518         def calc_speed(start, now, bytes):
519                 dif = now - start
520                 if bytes == 0 or dif < 0.001: # One millisecond
521                         return '%10s' % '---b/s'
522                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
523
524         @staticmethod
525         def best_block_size(elapsed_time, bytes):
526                 new_min = max(bytes / 2.0, 1.0)
527                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528                 if elapsed_time < 0.001:
529                         return long(new_max)
530                 rate = bytes / elapsed_time
531                 if rate > new_max:
532                         return long(new_max)
533                 if rate < new_min:
534                         return long(new_min)
535                 return long(rate)
536
537         @staticmethod
538         def parse_bytes(bytestr):
539                 """Parse a string indicating a byte quantity into a long integer."""
540                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
541                 if matchobj is None:
542                         return None
543                 number = float(matchobj.group(1))
544                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545                 return long(round(number * multiplier))
546
547         def add_info_extractor(self, ie):
548                 """Add an InfoExtractor object to the end of the list."""
549                 self._ies.append(ie)
550                 ie.set_downloader(self)
551
552         def add_post_processor(self, pp):
553                 """Add a PostProcessor object to the end of the chain."""
554                 self._pps.append(pp)
555                 pp.set_downloader(self)
556
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # The trailing comma on the print statement suppresses its
                                # own newline; 'terminator' alone decides line endings.
                                terminator = [u'\n', u''][skip_eol]
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise
567
        def to_stderr(self, message):
                """Print message to stderr, encoded for the current locale."""
                print >>sys.stderr, message.encode(preferredencoding())
571
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible OSC escape sequence: sets the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
583         def fixed_template(self):
584                 """Checks if the output template is fixed."""
585                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
586
587         def trouble(self, message=None):
588                 """Determine action to take when a download problem appears.
589
590                 Depending on if the downloader has been configured to ignore
591                 download errors or not, this method may throw an exception or
592                 not when errors are found, after printing the message.
593                 """
594                 if message is not None:
595                         self.to_stderr(message)
596                 if not self.params.get('ignoreerrors', False):
597                         raise DownloadError(message)
598                 self._download_retcode = 1
599
600         def slow_down(self, start_time, byte_counter):
601                 """Sleep if the download speed is over the rate limit."""
602                 rate_limit = self.params.get('ratelimit', None)
603                 if rate_limit is None or byte_counter == 0:
604                         return
605                 now = time.time()
606                 elapsed = now - start_time
607                 if elapsed <= 0.0:
608                         return
609                 speed = float(byte_counter) / elapsed
610                 if speed > rate_limit:
611                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613         def temp_name(self, filename):
614                 """Returns a temporary filename for the given filename."""
615                 if self.params.get('nopart', False) or filename == u'-' or \
616                                 (os.path.exists(filename) and not os.path.isfile(filename)):
617                         return filename
618                 return filename + u'.part'
619
620         def undo_temp_name(self, filename):
621                 if filename.endswith(u'.part'):
622                         return filename[:-len(u'.part')]
623                 return filename
624
625         def try_rename(self, old_filename, new_filename):
626                 try:
627                         if old_filename == new_filename:
628                                 return
629                         os.rename(old_filename, new_filename)
630                 except (IOError, OSError), err:
631                         self.trouble(u'ERROR: unable to rename file')
632
633         def try_utime(self, filename, last_modified_hdr):
634                 """Try to set the last-modified time of the given file."""
635                 if last_modified_hdr is None:
636                         return
637                 if not os.path.isfile(filename):
638                         return
639                 timestr = last_modified_hdr
640                 if timestr is None:
641                         return
642                 filetime = timeconvert(timestr)
643                 if filetime is None:
644                         return filetime
645                 try:
646                         os.utime(filename, (time.time(), filetime))
647                 except:
648                         pass
649                 return filetime
650
        def report_writedescription(self, descfn):
                """ Report that the description file is being written """
                # Informational only, so encoding failures are swallowed.
                self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
654
        def report_writeinfojson(self, infofn):
                """ Report that the .info.json metadata file is being written """
                # Informational only, so encoding failures are swallowed.
                self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
658
        def report_destination(self, filename):
                """Report destination filename."""
                # Informational only, so encoding failures are swallowed.
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
662
        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                # Leading \r (plus skip_eol) redraws the same line in place
                # instead of scrolling the terminal.
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
                self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
671
        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
675
        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
679
        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Filename is not representable in the console encoding;
                        # fall back to a message without it.
                        self.to_screen(u'[download] The file has already been downloaded')
686
687         def report_unable_to_resume(self):
688                 """Report it was impossible to resume download."""
689                 self.to_screen(u'[download] Unable to resume')
690
691         def report_finish(self):
692                 """Report download finished."""
693                 if self.params.get('noprogress', False):
694                         self.to_screen(u'[download] Download completed')
695                 else:
696                         self.to_screen(u'')
697
698         def increment_downloads(self):
699                 """Increment the ordinal that assigns a number to each file."""
700                 self._num_downloads += 1
701
702         def prepare_filename(self, info_dict):
703                 """Generate the output filename."""
704                 try:
705                         template_dict = dict(info_dict)
706                         template_dict['epoch'] = unicode(long(time.time()))
707                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708                         filename = self.params['outtmpl'] % template_dict
709                         return filename
710                 except (ValueError, KeyError), err:
711                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
712                         return None
713
714         def _match_entry(self, info_dict):
715                 """ Returns None iff the file should be downloaded """
716
717                 title = info_dict['title']
718                 matchtitle = self.params.get('matchtitle', False)
719                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721                 rejecttitle = self.params.get('rejecttitle', False)
722                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724                 return None
725
726         def process_info(self, info_dict):
727                 """Process a single dictionary returned by an InfoExtractor."""
728
729                 reason = self._match_entry(info_dict)
730                 if reason is not None:
731                         self.to_screen(u'[download] ' + reason)
732                         return
733
734                 max_downloads = self.params.get('max_downloads')
735                 if max_downloads is not None:
736                         if self._num_downloads > int(max_downloads):
737                                 raise MaxDownloadsReached()
738
739                 filename = self.prepare_filename(info_dict)
740                 
741                 # Forced printings
742                 if self.params.get('forcetitle', False):
743                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744                 if self.params.get('forceurl', False):
745                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
747                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748                 if self.params.get('forcedescription', False) and 'description' in info_dict:
749                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750                 if self.params.get('forcefilename', False) and filename is not None:
751                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
752                 if self.params.get('forceformat', False):
753                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
754
755                 # Do nothing else if in simulate mode
756                 if self.params.get('simulate', False):
757                         return
758
759                 if filename is None:
760                         return
761
762                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
763                         self.to_stderr(u'WARNING: file exists and will be skipped')
764                         return
765
766                 try:
767                         dn = os.path.dirname(filename)
768                         if dn != '' and not os.path.exists(dn):
769                                 os.makedirs(dn)
770                 except (OSError, IOError), err:
771                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
772                         return
773
774                 if self.params.get('writedescription', False):
775                         try:
776                                 descfn = filename + '.description'
777                                 self.report_writedescription(descfn)
778                                 descfile = open(descfn, 'wb')
779                                 try:
780                                         descfile.write(info_dict['description'].encode('utf-8'))
781                                 finally:
782                                         descfile.close()
783                         except (OSError, IOError):
784                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
785                                 return
786
787                 if self.params.get('writeinfojson', False):
788                         infofn = filename + '.info.json'
789                         self.report_writeinfojson(infofn)
790                         try:
791                                 json.dump
792                         except (NameError,AttributeError):
793                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
794                                 return
795                         try:
796                                 infof = open(infofn, 'wb')
797                                 try:
798                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
799                                         json.dump(json_info_dict, infof)
800                                 finally:
801                                         infof.close()
802                         except (OSError, IOError):
803                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
804                                 return
805
806                 if not self.params.get('skip_download', False):
807                         try:
808                                 success = self._do_download(filename, info_dict)
809                         except (OSError, IOError), err:
810                                 raise UnavailableVideoError
811                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
812                                 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
813                                 return
814                         except (ContentTooShortError, ), err:
815                                 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
816                                 return
817         
818                         if success:
819                                 try:
820                                         self.post_process(filename, info_dict)
821                                 except (PostProcessingError), err:
822                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
823                                         return
824
825         def download(self, url_list):
826                 """Download a given list of URLs."""
827                 if len(url_list) > 1 and self.fixed_template():
828                         raise SameFileError(self.params['outtmpl'])
829
830                 for url in url_list:
831                         suitable_found = False
832                         for ie in self._ies:
833                                 # Go to next InfoExtractor if not suitable
834                                 if not ie.suitable(url):
835                                         continue
836
837                                 # Suitable InfoExtractor found
838                                 suitable_found = True
839
840                                 # Extract information from URL and process it
841                                 ie.extract(url)
842
843                                 # Suitable InfoExtractor had been found; go to next URL
844                                 break
845
846                         if not suitable_found:
847                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
848
849                 return self._download_retcode
850
851         def post_process(self, filename, ie_info):
852                 """Run the postprocessing chain on the given file."""
853                 info = dict(ie_info)
854                 info['filepath'] = filename
855                 for pp in self._pps:
856                         info = pp.run(info)
857                         if info is None:
858                                 break
859
860         def _download_with_rtmpdump(self, filename, url, player_url):
861                 self.report_destination(filename)
862                 tmpfilename = self.temp_name(filename)
863
864                 # Check for rtmpdump first
865                 try:
866                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
867                 except (OSError, IOError):
868                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
869                         return False
870
871                 # Download using rtmpdump. rtmpdump returns exit code 2 when
872                 # the connection was interrumpted and resuming appears to be
873                 # possible. This is part of rtmpdump's normal usage, AFAIK.
874                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
875                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
876                 while retval == 2 or retval == 1:
877                         prevsize = os.path.getsize(tmpfilename)
878                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
879                         time.sleep(5.0) # This seems to be needed
880                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
881                         cursize = os.path.getsize(tmpfilename)
882                         if prevsize == cursize and retval == 1:
883                                 break
884                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
885                         if prevsize == cursize and retval == 2 and cursize > 1024:
886                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
887                                 retval = 0
888                                 break
889                 if retval == 0:
890                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
891                         self.try_rename(tmpfilename, filename)
892                         return True
893                 else:
894                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
895                         return False
896
897         def _do_download(self, filename, info_dict):
898                 url = info_dict['url']
899                 player_url = info_dict.get('player_url', None)
900
901                 # Check file already present
902                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
903                         self.report_file_already_downloaded(filename)
904                         return True
905
906                 # Attempt to download using rtmpdump
907                 if url.startswith('rtmp'):
908                         return self._download_with_rtmpdump(filename, url, player_url)
909
910                 tmpfilename = self.temp_name(filename)
911                 stream = None
912
913                 # Do not include the Accept-Encoding header
914                 headers = {'Youtubedl-no-compression': 'True'}
915                 basic_request = urllib2.Request(url, None, headers)
916                 request = urllib2.Request(url, None, headers)
917
918                 # Establish possible resume length
919                 if os.path.isfile(tmpfilename):
920                         resume_len = os.path.getsize(tmpfilename)
921                 else:
922                         resume_len = 0
923
924                 open_mode = 'wb'
925                 if resume_len != 0:
926                         if self.params.get('continuedl', False):
927                                 self.report_resuming_byte(resume_len)
928                                 request.add_header('Range','bytes=%d-' % resume_len)
929                                 open_mode = 'ab'
930                         else:
931                                 resume_len = 0
932
933                 count = 0
934                 retries = self.params.get('retries', 0)
935                 while count <= retries:
936                         # Establish connection
937                         try:
938                                 if count == 0 and 'urlhandle' in info_dict:
939                                         data = info_dict['urlhandle']
940                                 data = urllib2.urlopen(request)
941                                 break
942                         except (urllib2.HTTPError, ), err:
943                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
944                                         # Unexpected HTTP error
945                                         raise
946                                 elif err.code == 416:
947                                         # Unable to resume (requested range not satisfiable)
948                                         try:
949                                                 # Open the connection again without the range header
950                                                 data = urllib2.urlopen(basic_request)
951                                                 content_length = data.info()['Content-Length']
952                                         except (urllib2.HTTPError, ), err:
953                                                 if err.code < 500 or err.code >= 600:
954                                                         raise
955                                         else:
956                                                 # Examine the reported length
957                                                 if (content_length is not None and
958                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
959                                                         # The file had already been fully downloaded.
960                                                         # Explanation to the above condition: in issue #175 it was revealed that
961                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
962                                                         # changing the file size slightly and causing problems for some users. So
963                                                         # I decided to implement a suggested change and consider the file
964                                                         # completely downloaded if the file size differs less than 100 bytes from
965                                                         # the one in the hard drive.
966                                                         self.report_file_already_downloaded(filename)
967                                                         self.try_rename(tmpfilename, filename)
968                                                         return True
969                                                 else:
970                                                         # The length does not match, we start the download over
971                                                         self.report_unable_to_resume()
972                                                         open_mode = 'wb'
973                                                         break
974                         # Retry
975                         count += 1
976                         if count <= retries:
977                                 self.report_retry(count, retries)
978
979                 if count > retries:
980                         self.trouble(u'ERROR: giving up after %s retries' % retries)
981                         return False
982
983                 data_len = data.info().get('Content-length', None)
984                 if data_len is not None:
985                         data_len = long(data_len) + resume_len
986                 data_len_str = self.format_bytes(data_len)
987                 byte_counter = 0 + resume_len
988                 block_size = 1024
989                 start = time.time()
990                 while True:
991                         # Download and write
992                         before = time.time()
993                         data_block = data.read(block_size)
994                         after = time.time()
995                         if len(data_block) == 0:
996                                 break
997                         byte_counter += len(data_block)
998
999                         # Open file just in time
1000                         if stream is None:
1001                                 try:
1002                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1003                                         assert stream is not None
1004                                         filename = self.undo_temp_name(tmpfilename)
1005                                         self.report_destination(filename)
1006                                 except (OSError, IOError), err:
1007                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1008                                         return False
1009                         try:
1010                                 stream.write(data_block)
1011                         except (IOError, OSError), err:
1012                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1013                                 return False
1014                         block_size = self.best_block_size(after - before, len(data_block))
1015
1016                         # Progress message
1017                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1018                         if data_len is None:
1019                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1020                         else:
1021                                 percent_str = self.calc_percent(byte_counter, data_len)
1022                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1023                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1024
1025                         # Apply rate limit
1026                         self.slow_down(start, byte_counter - resume_len)
1027
1028                 if stream is None:
1029                         self.trouble(u'\nERROR: Did not get any data blocks')
1030                         return False
1031                 stream.close()
1032                 self.report_finish()
1033                 if data_len is not None and byte_counter != data_len:
1034                         raise ContentTooShortError(byte_counter, long(data_len))
1035                 self.try_rename(tmpfilename, filename)
1036
1037                 # Update file modification time
1038                 if self.params.get('updatetime', True):
1039                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1040
1041                 return True
1042
1043
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL, decides via suitable() whether
	it can handle it, and on extract() gathers everything the
	FileDownloader needs: the real video URL, the title and simplified
	title, the uploader and more.  The result is a dictionary which the
	FileDownloader then processes (typically by downloading the video).
	Each dictionary must include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional.  Their primary purpose is to
	allow youtube-dl to serve as the backend for a video search function,
	such as the one in youtube2mp3.  They are only used when their
	respective forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract()
	and define a _VALID_URL regexp.  Probably, they should also be added
	to the list of extractors.
	"""

	# Whether _real_initialize() has run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1112
1113
1114 class YoutubeIE(InfoExtractor):
1115         """Information extractor for youtube.com."""
1116
1117         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1118         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1119         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1120         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1121         _NETRC_MACHINE = 'youtube'
1122         # Listed in order of quality
1123         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1124         _video_extensions = {
1125                 '13': '3gp',
1126                 '17': 'mp4',
1127                 '18': 'mp4',
1128                 '22': 'mp4',
1129                 '37': 'mp4',
1130                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1131                 '43': 'webm',
1132                 '44': 'webm',
1133                 '45': 'webm',
1134         }
1135         _video_dimensions = {
1136                 '5': '240x400',
1137                 '6': '???',
1138                 '13': '???',
1139                 '17': '144x176',
1140                 '18': '360x640',
1141                 '22': '720x1280',
1142                 '34': '360x640',
1143                 '35': '480x854',
1144                 '37': '1080x1920',
1145                 '38': '3072x4096',
1146                 '43': '360x640',
1147                 '44': '480x854',
1148                 '45': '720x1280',
1149         }       
1150         IE_NAME = u'youtube'
1151
1152         def report_lang(self):
1153                 """Report attempt to set language."""
1154                 self._downloader.to_screen(u'[youtube] Setting language')
1155
1156         def report_login(self):
1157                 """Report attempt to log in."""
1158                 self._downloader.to_screen(u'[youtube] Logging in')
1159
1160         def report_age_confirmation(self):
1161                 """Report attempt to confirm age."""
1162                 self._downloader.to_screen(u'[youtube] Confirming age')
1163
1164         def report_video_webpage_download(self, video_id):
1165                 """Report attempt to download video webpage."""
1166                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1167
1168         def report_video_info_webpage_download(self, video_id):
1169                 """Report attempt to download video info webpage."""
1170                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1171
1172         def report_information_extraction(self, video_id):
1173                 """Report attempt to extract video information."""
1174                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1175
1176         def report_unavailable_format(self, video_id, format):
1177                 """Report extracted video URL."""
1178                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1179
1180         def report_rtmp_download(self):
1181                 """Indicate the download will use the RTMP protocol."""
1182                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1183
1184         def _print_formats(self, formats):
1185                 print 'Available formats:'
1186                 for x in formats:
1187                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1188
1189         def _real_initialize(self):
1190                 if self._downloader is None:
1191                         return
1192
1193                 username = None
1194                 password = None
1195                 downloader_params = self._downloader.params
1196
1197                 # Attempt to use provided username and password or .netrc data
1198                 if downloader_params.get('username', None) is not None:
1199                         username = downloader_params['username']
1200                         password = downloader_params['password']
1201                 elif downloader_params.get('usenetrc', False):
1202                         try:
1203                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1204                                 if info is not None:
1205                                         username = info[0]
1206                                         password = info[2]
1207                                 else:
1208                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1209                         except (IOError, netrc.NetrcParseError), err:
1210                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1211                                 return
1212
1213                 # Set language
1214                 request = urllib2.Request(self._LANG_URL)
1215                 try:
1216                         self.report_lang()
1217                         urllib2.urlopen(request).read()
1218                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1220                         return
1221
1222                 # No authentication to be performed
1223                 if username is None:
1224                         return
1225
1226                 # Log in
1227                 login_form = {
1228                                 'current_form': 'loginForm',
1229                                 'next':         '/',
1230                                 'action_login': 'Log In',
1231                                 'username':     username,
1232                                 'password':     password,
1233                                 }
1234                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1235                 try:
1236                         self.report_login()
1237                         login_results = urllib2.urlopen(request).read()
1238                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1239                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1240                                 return
1241                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1242                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1243                         return
1244
1245                 # Confirm age
1246                 age_form = {
1247                                 'next_url':             '/',
1248                                 'action_confirm':       'Confirm',
1249                                 }
1250                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1251                 try:
1252                         self.report_age_confirmation()
1253                         age_results = urllib2.urlopen(request).read()
1254                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1255                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1256                         return
1257
	def _real_extract(self, url):
		"""Extract YouTube video metadata and hand each chosen format to the downloader.

		Workflow: parse the video id out of *url*, fetch the watch page,
		query get_video_info (trying several '&el=...' variants) for the
		metadata/stream map, select the requested format(s), then call
		self._downloader.process_info() once per selected format.
		All failures are reported via self._downloader.trouble() and the
		method simply returns (no exceptions propagate to the caller).
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (needed later for rtmpdump's
		# --swfVfy; optional, so a miss leaves player_url = None).
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# The URL appears JS-escaped in the page; strip the backslashes.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info. Try several 'el' parameter values because some
		# videos only answer on particular variants; stop at the first
		# response that carries a 'token'.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			# No variant produced a token; surface YouTube's own reason when given.
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title (filesystem-friendly form)
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page and normalised to
		# YYYYMMDD; stays u'NA' if the date cannot be parsed.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except silently skips non-matching
					# formats; it also hides unrelated errors.
					pass

		# description: use lxml if it imported at module load, otherwise
		# fall back to a <meta> tag regex (only when the description is
		# actually wanted).
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token (extracted but not used below in this method)
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: single URL, no itag/format selection applies.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> URL map from the comma-separated stream map.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			# format_limit caps quality: only formats at or below the
			# limit (list is assumed ordered best-first) are considered.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension: mapped from the itag; 'flv' when unknown.
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1430
1431
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	Handles both native Metacafe videos and Metacafe pages that embed a
	YouTube video ('yt-' id prefix), which are delegated to a YoutubeIE
	instance supplied at construction time.
	"""

	# Group 1: video id, group 2: URL slug used as the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		# youtube_ie: a YoutubeIE used for 'yt-' prefixed (embedded) videos.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the family-filter opt-out.

		This primes the (module-wide) cookie jar so that subsequent video
		page requests are not blocked by the age/family filter.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a metacafe.com URL and download it.

		Errors are reported via self._downloader.trouble(); the method
		returns None in all cases.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, delegate to YoutubeIE.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage. Two layouts are
		# supported: a direct &mediaURL= parameter, or a flashvars blob
		# containing JSON-ish mediaData.
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (access token appended to the URL)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: parse the flashvars query string for mediaData.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Un-escape the JS-escaped slashes in the URL.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1572
1573
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion."""

	# Group 1: video id, group 2: URL slug used as the simplified title.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Dailymotion URL and download it.

		The media URL is taken from the 'sequence' flash variable on the
		video page (SD stream, 'sdURL'). Errors are reported via
		self._downloader.trouble(); the method returns None in all cases.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information.
		# The cookie disables the family filter so restricted videos load.
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		# Strip the JS escaping backslashes from the URL.
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1660
1661
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Matches videoplay URLs on the national Google Video domains;
	# group 1 is the docid.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a Google Video URL and download it.

		Tries the mp4 'download_url' first and falls back to the flv
		'videoUrl' flash variable. Errors are reported via
		self._downloader.trouble(); the method returns None in all cases.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No mp4 download link; fall back to the flv flash URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Decode the \xNN escapes embedded by the page's JavaScript.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires an extra search-page request,
		# so only done when the user asked for it).
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1764
1765
1766 class PhotobucketIE(InfoExtractor):
1767         """Information extractor for photobucket.com."""
1768
1769         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1770         IE_NAME = u'photobucket'
1771
1772         def __init__(self, downloader=None):
1773                 InfoExtractor.__init__(self, downloader)
1774
1775         def report_download_webpage(self, video_id):
1776                 """Report webpage download."""
1777                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1778
1779         def report_extraction(self, video_id):
1780                 """Report information extraction."""
1781                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1782
1783         def _real_extract(self, url):
1784                 # Extract id from URL
1785                 mobj = re.match(self._VALID_URL, url)
1786                 if mobj is None:
1787                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1788                         return
1789
1790                 # At this point we have a new video
1791                 self._downloader.increment_downloads()
1792                 video_id = mobj.group(1)
1793
1794                 video_extension = 'flv'
1795
1796                 # Retrieve video webpage to extract further information
1797                 request = urllib2.Request(url)
1798                 try:
1799                         self.report_download_webpage(video_id)
1800                         webpage = urllib2.urlopen(request).read()
1801                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1803                         return
1804
1805                 # Extract URL, uploader, and title from webpage
1806                 self.report_extraction(video_id)
1807                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1808                 if mobj is None:
1809                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1810                         return
1811                 mediaURL = urllib.unquote(mobj.group(1))
1812
1813                 video_url = mediaURL
1814
1815                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1816                 if mobj is None:
1817                         self._downloader.trouble(u'ERROR: unable to extract title')
1818                         return
1819                 video_title = mobj.group(1).decode('utf-8')
1820                 video_title = sanitize_title(video_title)
1821                 simple_title = _simplify_title(vide_title)
1822
1823                 video_uploader = mobj.group(2).decode('utf-8')
1824
1825                 try:
1826                         # Process video information
1827                         self._downloader.process_info({
1828                                 'id':           video_id.decode('utf-8'),
1829                                 'url':          video_url.decode('utf-8'),
1830                                 'uploader':     video_uploader,
1831                                 'upload_date':  u'NA',
1832                                 'title':        video_title,
1833                                 'stitle':       simple_title,
1834                                 'ext':          video_extension.decode('utf-8'),
1835                                 'format':       u'NA',
1836                                 'player_url':   None,
1837                         })
1838                 except UnavailableVideoError:
1839                         self._downloader.trouble(u'\nERROR: unable to download video')
1840
1841
1842 class YahooIE(InfoExtractor):
1843         """Information extractor for video.yahoo.com."""
1844
1845         # _VALID_URL matches all Yahoo! Video URLs
1846         # _VPAGE_URL matches only the extractable '/watch/' URLs
1847         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1848         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1849         IE_NAME = u'video.yahoo'
1850
1851         def __init__(self, downloader=None):
1852                 InfoExtractor.__init__(self, downloader)
1853
1854         def report_download_webpage(self, video_id):
1855                 """Report webpage download."""
1856                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1857
1858         def report_extraction(self, video_id):
1859                 """Report information extraction."""
1860                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1861
1862         def _real_extract(self, url, new_video=True):
1863                 # Extract ID from URL
1864                 mobj = re.match(self._VALID_URL, url)
1865                 if mobj is None:
1866                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1867                         return
1868
1869                 # At this point we have a new video
1870                 self._downloader.increment_downloads()
1871                 video_id = mobj.group(2)
1872                 video_extension = 'flv'
1873
1874                 # Rewrite valid but non-extractable URLs as
1875                 # extractable English language /watch/ URLs
1876                 if re.match(self._VPAGE_URL, url) is None:
1877                         request = urllib2.Request(url)
1878                         try:
1879                                 webpage = urllib2.urlopen(request).read()
1880                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1881                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1882                                 return
1883
1884                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1885                         if mobj is None:
1886                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1887                                 return
1888                         yahoo_id = mobj.group(1)
1889
1890                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1891                         if mobj is None:
1892                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1893                                 return
1894                         yahoo_vid = mobj.group(1)
1895
1896                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1897                         return self._real_extract(url, new_video=False)
1898
1899                 # Retrieve video webpage to extract further information
1900                 request = urllib2.Request(url)
1901                 try:
1902                         self.report_download_webpage(video_id)
1903                         webpage = urllib2.urlopen(request).read()
1904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1906                         return
1907
1908                 # Extract uploader and title from webpage
1909                 self.report_extraction(video_id)
1910                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1911                 if mobj is None:
1912                         self._downloader.trouble(u'ERROR: unable to extract video title')
1913                         return
1914                 video_title = mobj.group(1).decode('utf-8')
1915                 simple_title = _simplify_title(video_title)
1916
1917                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1918                 if mobj is None:
1919                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1920                         return
1921                 video_uploader = mobj.group(1).decode('utf-8')
1922
1923                 # Extract video thumbnail
1924                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1925                 if mobj is None:
1926                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1927                         return
1928                 video_thumbnail = mobj.group(1).decode('utf-8')
1929
1930                 # Extract video description
1931                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1932                 if mobj is None:
1933                         self._downloader.trouble(u'ERROR: unable to extract video description')
1934                         return
1935                 video_description = mobj.group(1).decode('utf-8')
1936                 if not video_description:
1937                         video_description = 'No description available.'
1938
1939                 # Extract video height and width
1940                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1941                 if mobj is None:
1942                         self._downloader.trouble(u'ERROR: unable to extract video height')
1943                         return
1944                 yv_video_height = mobj.group(1)
1945
1946                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1947                 if mobj is None:
1948                         self._downloader.trouble(u'ERROR: unable to extract video width')
1949                         return
1950                 yv_video_width = mobj.group(1)
1951
1952                 # Retrieve video playlist to extract media URL
1953                 # I'm not completely sure what all these options are, but we
1954                 # seem to need most of them, otherwise the server sends a 401.
1955                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1956                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1957                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1958                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1959                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1960                 try:
1961                         self.report_download_webpage(video_id)
1962                         webpage = urllib2.urlopen(request).read()
1963                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1964                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1965                         return
1966
1967                 # Extract media URL from playlist XML
1968                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1969                 if mobj is None:
1970                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1971                         return
1972                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1973                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1974
1975                 try:
1976                         # Process video information
1977                         self._downloader.process_info({
1978                                 'id':           video_id.decode('utf-8'),
1979                                 'url':          video_url,
1980                                 'uploader':     video_uploader,
1981                                 'upload_date':  u'NA',
1982                                 'title':        video_title,
1983                                 'stitle':       simple_title,
1984                                 'ext':          video_extension.decode('utf-8'),
1985                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1986                                 'description':  video_description,
1987                                 'thumbnail':    video_thumbnail,
1988                                 'player_url':   None,
1989                         })
1990                 except UnavailableVideoError:
1991                         self._downloader.trouble(u'\nERROR: unable to download video')
1992
1993
1994 class VimeoIE(InfoExtractor):
1995         """Information extractor for vimeo.com."""
1996
1997         # _VALID_URL matches Vimeo URLs
1998         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1999         IE_NAME = u'vimeo'
2000
2001         def __init__(self, downloader=None):
2002                 InfoExtractor.__init__(self, downloader)
2003
2004         def report_download_webpage(self, video_id):
2005                 """Report webpage download."""
2006                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2007
2008         def report_extraction(self, video_id):
2009                 """Report information extraction."""
2010                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2011
2012         def _real_extract(self, url, new_video=True):
2013                 # Extract ID from URL
2014                 mobj = re.match(self._VALID_URL, url)
2015                 if mobj is None:
2016                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2017                         return
2018
2019                 # At this point we have a new video
2020                 self._downloader.increment_downloads()
2021                 video_id = mobj.group(1)
2022
2023                 # Retrieve video webpage to extract further information
2024                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2025                 try:
2026                         self.report_download_webpage(video_id)
2027                         webpage = urllib2.urlopen(request).read()
2028                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2029                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2030                         return
2031
2032                 # Now we begin extracting as much information as we can from what we
2033                 # retrieved. First we extract the information common to all extractors,
2034                 # and latter we extract those that are Vimeo specific.
2035                 self.report_extraction(video_id)
2036
2037                 # Extract title
2038                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2039                 if mobj is None:
2040                         self._downloader.trouble(u'ERROR: unable to extract video title')
2041                         return
2042                 video_title = mobj.group(1).decode('utf-8')
2043                 simple_title = _simplify_title(video_title)
2044
2045                 # Extract uploader
2046                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2047                 if mobj is None:
2048                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2049                         return
2050                 video_uploader = mobj.group(1).decode('utf-8')
2051
2052                 # Extract video thumbnail
2053                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2054                 if mobj is None:
2055                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2056                         return
2057                 video_thumbnail = mobj.group(1).decode('utf-8')
2058
2059                 # # Extract video description
2060                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2061                 # if mobj is None:
2062                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2063                 #       return
2064                 # video_description = mobj.group(1).decode('utf-8')
2065                 # if not video_description: video_description = 'No description available.'
2066                 video_description = 'Foo.'
2067
2068                 # Vimeo specific: extract request signature
2069                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2070                 if mobj is None:
2071                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2072                         return
2073                 sig = mobj.group(1).decode('utf-8')
2074
2075                 # Vimeo specific: extract video quality information
2076                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2077                 if mobj is None:
2078                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2079                         return
2080                 quality = mobj.group(1).decode('utf-8')
2081
2082                 if int(quality) == 1:
2083                         quality = 'hd'
2084                 else:
2085                         quality = 'sd'
2086
2087                 # Vimeo specific: Extract request signature expiration
2088                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2089                 if mobj is None:
2090                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2091                         return
2092                 sig_exp = mobj.group(1).decode('utf-8')
2093
2094                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2095
2096                 try:
2097                         # Process video information
2098                         self._downloader.process_info({
2099                                 'id':           video_id.decode('utf-8'),
2100                                 'url':          video_url,
2101                                 'uploader':     video_uploader,
2102                                 'upload_date':  u'NA',
2103                                 'title':        video_title,
2104                                 'stitle':       simple_title,
2105                                 'ext':          u'mp4',
2106                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2107                                 'description':  video_description,
2108                                 'thumbnail':    video_thumbnail,
2109                                 'description':  video_description,
2110                                 'player_url':   None,
2111                         })
2112                 except UnavailableVideoError:
2113                         self._downloader.trouble(u'ERROR: unable to download video')
2114
2115
2116 class GenericIE(InfoExtractor):
2117         """Generic last-resort information extractor."""
2118
2119         _VALID_URL = r'.*'
2120         IE_NAME = u'generic'
2121
2122         def __init__(self, downloader=None):
2123                 InfoExtractor.__init__(self, downloader)
2124
2125         def report_download_webpage(self, video_id):
2126                 """Report webpage download."""
2127                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2128                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2129
2130         def report_extraction(self, video_id):
2131                 """Report information extraction."""
2132                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2133
2134         def _real_extract(self, url):
2135                 # At this point we have a new video
2136                 self._downloader.increment_downloads()
2137
2138                 video_id = url.split('/')[-1]
2139                 request = urllib2.Request(url)
2140                 try:
2141                         self.report_download_webpage(video_id)
2142                         webpage = urllib2.urlopen(request).read()
2143                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2144                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2145                         return
2146                 except ValueError, err:
2147                         # since this is the last-resort InfoExtractor, if
2148                         # this error is thrown, it'll be thrown here
2149                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2150                         return
2151
2152                 self.report_extraction(video_id)
2153                 # Start with something easy: JW Player in SWFObject
2154                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2155                 if mobj is None:
2156                         # Broaden the search a little bit
2157                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2158                 if mobj is None:
2159                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2160                         return
2161
2162                 # It's possible that one of the regexes
2163                 # matched, but returned an empty group:
2164                 if mobj.group(1) is None:
2165                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2166                         return
2167
2168                 video_url = urllib.unquote(mobj.group(1))
2169                 video_id = os.path.basename(video_url)
2170
2171                 # here's a fun little line of code for you:
2172                 video_extension = os.path.splitext(video_id)[1][1:]
2173                 video_id = os.path.splitext(video_id)[0]
2174
2175                 # it's tempting to parse this further, but you would
2176                 # have to take into account all the variations like
2177                 #   Video Title - Site Name
2178                 #   Site Name | Video Title
2179                 #   Video Title - Tagline | Site Name
2180                 # and so on and so forth; it's just not practical
2181                 mobj = re.search(r'<title>(.*)</title>', webpage)
2182                 if mobj is None:
2183                         self._downloader.trouble(u'ERROR: unable to extract title')
2184                         return
2185                 video_title = mobj.group(1).decode('utf-8')
2186                 video_title = sanitize_title(video_title)
2187                 simple_title = _simplify_title(video_title)
2188
2189                 # video uploader is domain name
2190                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2191                 if mobj is None:
2192                         self._downloader.trouble(u'ERROR: unable to extract title')
2193                         return
2194                 video_uploader = mobj.group(1).decode('utf-8')
2195
2196                 try:
2197                         # Process video information
2198                         self._downloader.process_info({
2199                                 'id':           video_id.decode('utf-8'),
2200                                 'url':          video_url.decode('utf-8'),
2201                                 'uploader':     video_uploader,
2202                                 'upload_date':  u'NA',
2203                                 'title':        video_title,
2204                                 'stitle':       simple_title,
2205                                 'ext':          video_extension.decode('utf-8'),
2206                                 'format':       u'NA',
2207                                 'player_url':   None,
2208                         })
2209                 except UnavailableVideoError, err:
2210                         self._downloader.trouble(u'\nERROR: unable to download video')
2211
2212
2213 class YoutubeSearchIE(InfoExtractor):
2214         """Information Extractor for YouTube search queries."""
2215         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2216         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2217         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2218         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2219         _youtube_ie = None
2220         _max_youtube_results = 1000
2221         IE_NAME = u'youtube:search'
2222
2223         def __init__(self, youtube_ie, downloader=None):
2224                 InfoExtractor.__init__(self, downloader)
2225                 self._youtube_ie = youtube_ie
2226
2227         def report_download_page(self, query, pagenum):
2228                 """Report attempt to download playlist page with given number."""
2229                 query = query.decode(preferredencoding())
2230                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2231
2232         def _real_initialize(self):
2233                 self._youtube_ie.initialize()
2234
2235         def _real_extract(self, query):
2236                 mobj = re.match(self._VALID_URL, query)
2237                 if mobj is None:
2238                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2239                         return
2240
2241                 prefix, query = query.split(':')
2242                 prefix = prefix[8:]
2243                 query = query.encode('utf-8')
2244                 if prefix == '':
2245                         self._download_n_results(query, 1)
2246                         return
2247                 elif prefix == 'all':
2248                         self._download_n_results(query, self._max_youtube_results)
2249                         return
2250                 else:
2251                         try:
2252                                 n = long(prefix)
2253                                 if n <= 0:
2254                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2255                                         return
2256                                 elif n > self._max_youtube_results:
2257                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2258                                         n = self._max_youtube_results
2259                                 self._download_n_results(query, n)
2260                                 return
2261                         except ValueError: # parsing prefix as integer fails
2262                                 self._download_n_results(query, 1)
2263                                 return
2264
2265         def _download_n_results(self, query, n):
2266                 """Downloads a specified number of results for a query"""
2267
2268                 video_ids = []
2269                 already_seen = set()
2270                 pagenum = 1
2271
2272                 while True:
2273                         self.report_download_page(query, pagenum)
2274                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2275                         request = urllib2.Request(result_url)
2276                         try:
2277                                 page = urllib2.urlopen(request).read()
2278                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2279                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2280                                 return
2281
2282                         # Extract video identifiers
2283                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2284                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2285                                 if video_id not in already_seen:
2286                                         video_ids.append(video_id)
2287                                         already_seen.add(video_id)
2288                                         if len(video_ids) == n:
2289                                                 # Specified n videos reached
2290                                                 for id in video_ids:
2291                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2292                                                 return
2293
2294                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2295                                 for id in video_ids:
2296                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2297                                 return
2298
2299                         pagenum = pagenum + 1
2300
2301
2302 class GoogleSearchIE(InfoExtractor):
2303         """Information Extractor for Google Video search queries."""
2304         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2305         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2306         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2307         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2308         _google_ie = None
2309         _max_google_results = 1000
2310         IE_NAME = u'video.google:search'
2311
2312         def __init__(self, google_ie, downloader=None):
2313                 InfoExtractor.__init__(self, downloader)
2314                 self._google_ie = google_ie
2315
2316         def report_download_page(self, query, pagenum):
2317                 """Report attempt to download playlist page with given number."""
2318                 query = query.decode(preferredencoding())
2319                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2320
2321         def _real_initialize(self):
2322                 self._google_ie.initialize()
2323
2324         def _real_extract(self, query):
2325                 mobj = re.match(self._VALID_URL, query)
2326                 if mobj is None:
2327                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2328                         return
2329
2330                 prefix, query = query.split(':')
2331                 prefix = prefix[8:]
2332                 query = query.encode('utf-8')
2333                 if prefix == '':
2334                         self._download_n_results(query, 1)
2335                         return
2336                 elif prefix == 'all':
2337                         self._download_n_results(query, self._max_google_results)
2338                         return
2339                 else:
2340                         try:
2341                                 n = long(prefix)
2342                                 if n <= 0:
2343                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2344                                         return
2345                                 elif n > self._max_google_results:
2346                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2347                                         n = self._max_google_results
2348                                 self._download_n_results(query, n)
2349                                 return
2350                         except ValueError: # parsing prefix as integer fails
2351                                 self._download_n_results(query, 1)
2352                                 return
2353
2354         def _download_n_results(self, query, n):
2355                 """Downloads a specified number of results for a query"""
2356
2357                 video_ids = []
2358                 already_seen = set()
2359                 pagenum = 1
2360
2361                 while True:
2362                         self.report_download_page(query, pagenum)
2363                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2364                         request = urllib2.Request(result_url)
2365                         try:
2366                                 page = urllib2.urlopen(request).read()
2367                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2368                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2369                                 return
2370
2371                         # Extract video identifiers
2372                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2373                                 video_id = mobj.group(1)
2374                                 if video_id not in already_seen:
2375                                         video_ids.append(video_id)
2376                                         already_seen.add(video_id)
2377                                         if len(video_ids) == n:
2378                                                 # Specified n videos reached
2379                                                 for id in video_ids:
2380                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2381                                                 return
2382
2383                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2384                                 for id in video_ids:
2385                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2386                                 return
2387
2388                         pagenum = pagenum + 1
2389
2390
2391 class YahooSearchIE(InfoExtractor):
2392         """Information Extractor for Yahoo! Video search queries."""
2393         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2394         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2395         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2396         _MORE_PAGES_INDICATOR = r'\s*Next'
2397         _yahoo_ie = None
2398         _max_yahoo_results = 1000
2399         IE_NAME = u'video.yahoo:search'
2400
2401         def __init__(self, yahoo_ie, downloader=None):
2402                 InfoExtractor.__init__(self, downloader)
2403                 self._yahoo_ie = yahoo_ie
2404
2405         def report_download_page(self, query, pagenum):
2406                 """Report attempt to download playlist page with given number."""
2407                 query = query.decode(preferredencoding())
2408                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2409
2410         def _real_initialize(self):
2411                 self._yahoo_ie.initialize()
2412
2413         def _real_extract(self, query):
2414                 mobj = re.match(self._VALID_URL, query)
2415                 if mobj is None:
2416                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2417                         return
2418
2419                 prefix, query = query.split(':')
2420                 prefix = prefix[8:]
2421                 query = query.encode('utf-8')
2422                 if prefix == '':
2423                         self._download_n_results(query, 1)
2424                         return
2425                 elif prefix == 'all':
2426                         self._download_n_results(query, self._max_yahoo_results)
2427                         return
2428                 else:
2429                         try:
2430                                 n = long(prefix)
2431                                 if n <= 0:
2432                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2433                                         return
2434                                 elif n > self._max_yahoo_results:
2435                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2436                                         n = self._max_yahoo_results
2437                                 self._download_n_results(query, n)
2438                                 return
2439                         except ValueError: # parsing prefix as integer fails
2440                                 self._download_n_results(query, 1)
2441                                 return
2442
2443         def _download_n_results(self, query, n):
2444                 """Downloads a specified number of results for a query"""
2445
2446                 video_ids = []
2447                 already_seen = set()
2448                 pagenum = 1
2449
2450                 while True:
2451                         self.report_download_page(query, pagenum)
2452                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2453                         request = urllib2.Request(result_url)
2454                         try:
2455                                 page = urllib2.urlopen(request).read()
2456                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2457                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2458                                 return
2459
2460                         # Extract video identifiers
2461                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2462                                 video_id = mobj.group(1)
2463                                 if video_id not in already_seen:
2464                                         video_ids.append(video_id)
2465                                         already_seen.add(video_id)
2466                                         if len(video_ids) == n:
2467                                                 # Specified n videos reached
2468                                                 for id in video_ids:
2469                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2470                                                 return
2471
2472                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2473                                 for id in video_ids:
2474                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2475                                 return
2476
2477                         pagenum = pagenum + 1
2478
2479
2480 class YoutubePlaylistIE(InfoExtractor):
2481         """Information Extractor for YouTube playlists."""
2482
2483         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2484         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2485         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2486         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2487         _youtube_ie = None
2488         IE_NAME = u'youtube:playlist'
2489
2490         def __init__(self, youtube_ie, downloader=None):
2491                 InfoExtractor.__init__(self, downloader)
2492                 self._youtube_ie = youtube_ie
2493
2494         def report_download_page(self, playlist_id, pagenum):
2495                 """Report attempt to download playlist page with given number."""
2496                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2497
2498         def _real_initialize(self):
2499                 self._youtube_ie.initialize()
2500
2501         def _real_extract(self, url):
2502                 # Extract playlist id
2503                 mobj = re.match(self._VALID_URL, url)
2504                 if mobj is None:
2505                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2506                         return
2507
2508                 # Single video case
2509                 if mobj.group(3) is not None:
2510                         self._youtube_ie.extract(mobj.group(3))
2511                         return
2512
2513                 # Download playlist pages
2514                 # prefix is 'p' as default for playlists but there are other types that need extra care
2515                 playlist_prefix = mobj.group(1)
2516                 if playlist_prefix == 'a':
2517                         playlist_access = 'artist'
2518                 else:
2519                         playlist_prefix = 'p'
2520                         playlist_access = 'view_play_list'
2521                 playlist_id = mobj.group(2)
2522                 video_ids = []
2523                 pagenum = 1
2524
2525                 while True:
2526                         self.report_download_page(playlist_id, pagenum)
2527                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2528                         request = urllib2.Request(url)
2529                         try:
2530                                 page = urllib2.urlopen(request).read()
2531                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2533                                 return
2534
2535                         # Extract video identifiers
2536                         ids_in_page = []
2537                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2538                                 if mobj.group(1) not in ids_in_page:
2539                                         ids_in_page.append(mobj.group(1))
2540                         video_ids.extend(ids_in_page)
2541
2542                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2543                                 break
2544                         pagenum = pagenum + 1
2545
2546                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2547                 playlistend = self._downloader.params.get('playlistend', -1)
2548                 video_ids = video_ids[playliststart:playlistend]
2549
2550                 for id in video_ids:
2551                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2552                 return
2553
2554
2555 class YoutubeUserIE(InfoExtractor):
2556         """Information Extractor for YouTube users."""
2557
2558         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2559         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2560         _GDATA_PAGE_SIZE = 50
2561         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2562         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2563         _youtube_ie = None
2564         IE_NAME = u'youtube:user'
2565
2566         def __init__(self, youtube_ie, downloader=None):
2567                 InfoExtractor.__init__(self, downloader)
2568                 self._youtube_ie = youtube_ie
2569
2570         def report_download_page(self, username, start_index):
2571                 """Report attempt to download user page."""
2572                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2573                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2574
2575         def _real_initialize(self):
2576                 self._youtube_ie.initialize()
2577
2578         def _real_extract(self, url):
2579                 # Extract username
2580                 mobj = re.match(self._VALID_URL, url)
2581                 if mobj is None:
2582                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2583                         return
2584
2585                 username = mobj.group(1)
2586
2587                 # Download video ids using YouTube Data API. Result size per
2588                 # query is limited (currently to 50 videos) so we need to query
2589                 # page by page until there are no video ids - it means we got
2590                 # all of them.
2591
2592                 video_ids = []
2593                 pagenum = 0
2594
2595                 while True:
2596                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2597                         self.report_download_page(username, start_index)
2598
2599                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2600
2601                         try:
2602                                 page = urllib2.urlopen(request).read()
2603                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2604                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2605                                 return
2606
2607                         # Extract video identifiers
2608                         ids_in_page = []
2609
2610                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2611                                 if mobj.group(1) not in ids_in_page:
2612                                         ids_in_page.append(mobj.group(1))
2613
2614                         video_ids.extend(ids_in_page)
2615
2616                         # A little optimization - if current page is not
2617                         # "full", ie. does not contain PAGE_SIZE video ids then
2618                         # we can assume that this page is the last one - there
2619                         # are no more ids on further pages - no need to query
2620                         # again.
2621
2622                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2623                                 break
2624
2625                         pagenum += 1
2626
2627                 all_ids_count = len(video_ids)
2628                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2629                 playlistend = self._downloader.params.get('playlistend', -1)
2630
2631                 if playlistend == -1:
2632                         video_ids = video_ids[playliststart:]
2633                 else:
2634                         video_ids = video_ids[playliststart:playlistend]
2635
2636                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2637                                 (username, all_ids_count, len(video_ids)))
2638
2639                 for video_id in video_ids:
2640                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2641
2642
2643 class DepositFilesIE(InfoExtractor):
2644         """Information extractor for depositfiles.com"""
2645
2646         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2647         IE_NAME = u'DepositFiles'
2648
2649         def __init__(self, downloader=None):
2650                 InfoExtractor.__init__(self, downloader)
2651
2652         def report_download_webpage(self, file_id):
2653                 """Report webpage download."""
2654                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2655
2656         def report_extraction(self, file_id):
2657                 """Report information extraction."""
2658                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2659
2660         def _real_extract(self, url):
2661                 # At this point we have a new file
2662                 self._downloader.increment_downloads()
2663
2664                 file_id = url.split('/')[-1]
2665                 # Rebuild url in english locale
2666                 url = 'http://depositfiles.com/en/files/' + file_id
2667
2668                 # Retrieve file webpage with 'Free download' button pressed
2669                 free_download_indication = { 'gateway_result' : '1' }
2670                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2671                 try:
2672                         self.report_download_webpage(file_id)
2673                         webpage = urllib2.urlopen(request).read()
2674                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2675                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2676                         return
2677
2678                 # Search for the real file URL
2679                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2680                 if (mobj is None) or (mobj.group(1) is None):
2681                         # Try to figure out reason of the error.
2682                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2683                         if (mobj is not None) and (mobj.group(1) is not None):
2684                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2685                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2686                         else:
2687                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2688                         return
2689
2690                 file_url = mobj.group(1)
2691                 file_extension = os.path.splitext(file_url)[1][1:]
2692
2693                 # Search for file title
2694                 mobj = re.search(r'<b title="(.*?)">', webpage)
2695                 if mobj is None:
2696                         self._downloader.trouble(u'ERROR: unable to extract title')
2697                         return
2698                 file_title = mobj.group(1).decode('utf-8')
2699
2700                 try:
2701                         # Process file information
2702                         self._downloader.process_info({
2703                                 'id':           file_id.decode('utf-8'),
2704                                 'url':          file_url.decode('utf-8'),
2705                                 'uploader':     u'NA',
2706                                 'upload_date':  u'NA',
2707                                 'title':        file_title,
2708                                 'stitle':       file_title,
2709                                 'ext':          file_extension.decode('utf-8'),
2710                                 'format':       u'NA',
2711                                 'player_url':   None,
2712                         })
2713                 except UnavailableVideoError, err:
2714                         self._downloader.trouble(u'ERROR: unable to download file')
2715
2716
2717 class FacebookIE(InfoExtractor):
2718         """Information Extractor for Facebook"""
2719
2720         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2721         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2722         _NETRC_MACHINE = 'facebook'
2723         _available_formats = ['video', 'highqual', 'lowqual']
2724         _video_extensions = {
2725                 'video': 'mp4',
2726                 'highqual': 'mp4',
2727                 'lowqual': 'mp4',
2728         }
2729         IE_NAME = u'facebook'
2730
2731         def __init__(self, downloader=None):
2732                 InfoExtractor.__init__(self, downloader)
2733
2734         def _reporter(self, message):
2735                 """Add header and report message."""
2736                 self._downloader.to_screen(u'[facebook] %s' % message)
2737
2738         def report_login(self):
2739                 """Report attempt to log in."""
2740                 self._reporter(u'Logging in')
2741
2742         def report_video_webpage_download(self, video_id):
2743                 """Report attempt to download video webpage."""
2744                 self._reporter(u'%s: Downloading video webpage' % video_id)
2745
2746         def report_information_extraction(self, video_id):
2747                 """Report attempt to extract video information."""
2748                 self._reporter(u'%s: Extracting video information' % video_id)
2749
2750         def _parse_page(self, video_webpage):
2751                 """Extract video information from page"""
2752                 # General data
2753                 data = {'title': r'\("video_title", "(.*?)"\)',
2754                         'description': r'<div class="datawrap">(.*?)</div>',
2755                         'owner': r'\("video_owner_name", "(.*?)"\)',
2756                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2757                         }
2758                 video_info = {}
2759                 for piece in data.keys():
2760                         mobj = re.search(data[piece], video_webpage)
2761                         if mobj is not None:
2762                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2763
2764                 # Video urls
2765                 video_urls = {}
2766                 for fmt in self._available_formats:
2767                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2768                         if mobj is not None:
2769                                 # URL is in a Javascript segment inside an escaped Unicode format within
2770                                 # the generally utf-8 page
2771                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2772                 video_info['video_urls'] = video_urls
2773
2774                 return video_info
2775
2776         def _real_initialize(self):
2777                 if self._downloader is None:
2778                         return
2779
2780                 useremail = None
2781                 password = None
2782                 downloader_params = self._downloader.params
2783
2784                 # Attempt to use provided username and password or .netrc data
2785                 if downloader_params.get('username', None) is not None:
2786                         useremail = downloader_params['username']
2787                         password = downloader_params['password']
2788                 elif downloader_params.get('usenetrc', False):
2789                         try:
2790                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2791                                 if info is not None:
2792                                         useremail = info[0]
2793                                         password = info[2]
2794                                 else:
2795                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2796                         except (IOError, netrc.NetrcParseError), err:
2797                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2798                                 return
2799
2800                 if useremail is None:
2801                         return
2802
2803                 # Log in
2804                 login_form = {
2805                         'email': useremail,
2806                         'pass': password,
2807                         'login': 'Log+In'
2808                         }
2809                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2810                 try:
2811                         self.report_login()
2812                         login_results = urllib2.urlopen(request).read()
2813                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2814                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2815                                 return
2816                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2817                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2818                         return
2819
2820         def _real_extract(self, url):
2821                 mobj = re.match(self._VALID_URL, url)
2822                 if mobj is None:
2823                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2824                         return
2825                 video_id = mobj.group('ID')
2826
2827                 # Get video webpage
2828                 self.report_video_webpage_download(video_id)
2829                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2830                 try:
2831                         page = urllib2.urlopen(request)
2832                         video_webpage = page.read()
2833                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2834                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2835                         return
2836
2837                 # Start extracting information
2838                 self.report_information_extraction(video_id)
2839
2840                 # Extract information
2841                 video_info = self._parse_page(video_webpage)
2842
2843                 # uploader
2844                 if 'owner' not in video_info:
2845                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2846                         return
2847                 video_uploader = video_info['owner']
2848
2849                 # title
2850                 if 'title' not in video_info:
2851                         self._downloader.trouble(u'ERROR: unable to extract video title')
2852                         return
2853                 video_title = video_info['title']
2854                 video_title = video_title.decode('utf-8')
2855                 video_title = sanitize_title(video_title)
2856
2857                 simple_title = _simplify_title(video_title)
2858
2859                 # thumbnail image
2860                 if 'thumbnail' not in video_info:
2861                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2862                         video_thumbnail = ''
2863                 else:
2864                         video_thumbnail = video_info['thumbnail']
2865
2866                 # upload date
2867                 upload_date = u'NA'
2868                 if 'upload_date' in video_info:
2869                         upload_time = video_info['upload_date']
2870                         timetuple = email.utils.parsedate_tz(upload_time)
2871                         if timetuple is not None:
2872                                 try:
2873                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2874                                 except:
2875                                         pass
2876
2877                 # description
2878                 video_description = video_info.get('description', 'No description available.')
2879
2880                 url_map = video_info['video_urls']
2881                 if len(url_map.keys()) > 0:
2882                         # Decide which formats to download
2883                         req_format = self._downloader.params.get('format', None)
2884                         format_limit = self._downloader.params.get('format_limit', None)
2885
2886                         if format_limit is not None and format_limit in self._available_formats:
2887                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2888                         else:
2889                                 format_list = self._available_formats
2890                         existing_formats = [x for x in format_list if x in url_map]
2891                         if len(existing_formats) == 0:
2892                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2893                                 return
2894                         if req_format is None:
2895                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2896                         elif req_format == 'worst':
2897                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2898                         elif req_format == '-1':
2899                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2900                         else:
2901                                 # Specific format
2902                                 if req_format not in url_map:
2903                                         self._downloader.trouble(u'ERROR: requested format not available')
2904                                         return
2905                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2906
2907                 for format_param, video_real_url in video_url_list:
2908
2909                         # At this point we have a new video
2910                         self._downloader.increment_downloads()
2911
2912                         # Extension
2913                         video_extension = self._video_extensions.get(format_param, 'mp4')
2914
2915                         try:
2916                                 # Process video information
2917                                 self._downloader.process_info({
2918                                         'id':           video_id.decode('utf-8'),
2919                                         'url':          video_real_url.decode('utf-8'),
2920                                         'uploader':     video_uploader.decode('utf-8'),
2921                                         'upload_date':  upload_date,
2922                                         'title':        video_title,
2923                                         'stitle':       simple_title,
2924                                         'ext':          video_extension.decode('utf-8'),
2925                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2926                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2927                                         'description':  video_description.decode('utf-8'),
2928                                         'player_url':   None,
2929                                 })
2930                         except UnavailableVideoError, err:
2931                                 self._downloader.trouble(u'\nERROR: unable to download video')
2932
2933 class BlipTVIE(InfoExtractor):
2934         """Information extractor for blip.tv"""
2935
2936         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2937         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2938         IE_NAME = u'blip.tv'
2939
2940         def report_extraction(self, file_id):
2941                 """Report information extraction."""
2942                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2943
2944         def report_direct_download(self, title):
2945                 """Report information extraction."""
2946                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2947
	def _real_extract(self, url):
		"""Extract video information via the site's JSON API.

		First requests the JSON variant of the page; if the server answers
		with a video/* Content-Type the URL is treated as a direct media
		download instead, and the already-open handle is passed along so
		the file is not fetched twice.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON API parameters with the correct query separator.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The response body is the media itself: derive the id,
				# title and extension from the URL's last path component.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh # open handle reused downstream to avoid a second fetch
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# Read the JSON document from the handle opened above.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Some responses wrap the payload in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Covers both malformed JSON values and missing keys.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3025
3026
3027 class MyVideoIE(InfoExtractor):
3028         """Information Extractor for myvideo.de."""
3029
3030         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3031         IE_NAME = u'myvideo'
3032
3033         def __init__(self, downloader=None):
3034                 InfoExtractor.__init__(self, downloader)
3035         
3036         def report_download_webpage(self, video_id):
3037                 """Report webpage download."""
3038                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3039
3040         def report_extraction(self, video_id):
3041                 """Report information extraction."""
3042                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3043
3044         def _real_extract(self,url):
3045                 mobj = re.match(self._VALID_URL, url)
3046                 if mobj is None:
3047                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3048                         return
3049
3050                 video_id = mobj.group(1)
3051
3052                 # Get video webpage
3053                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3054                 try:
3055                         self.report_download_webpage(video_id)
3056                         webpage = urllib2.urlopen(request).read()
3057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3058                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3059                         return
3060
3061                 self.report_extraction(video_id)
3062                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3063                                  webpage)
3064                 if mobj is None:
3065                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3066                         return
3067                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3068
3069                 mobj = re.search('<title>([^<]+)</title>', webpage)
3070                 if mobj is None:
3071                         self._downloader.trouble(u'ERROR: unable to extract title')
3072                         return
3073
3074                 video_title = mobj.group(1)
3075                 video_title = sanitize_title(video_title)
3076
3077                 simple_title = _simplify_title(video_title)
3078
3079                 try:
3080                         self._downloader.process_info({
3081                                 'id':           video_id,
3082                                 'url':          video_url,
3083                                 'uploader':     u'NA',
3084                                 'upload_date':  u'NA',
3085                                 'title':        video_title,
3086                                 'stitle':       simple_title,
3087                                 'ext':          u'flv',
3088                                 'format':       u'NA',
3089                                 'player_url':   None,
3090                         })
3091                 except UnavailableVideoError:
3092                         self._downloader.trouble(u'\nERROR: Unable to download video')
3093
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts shortcut pseudo-URLs (':tds', ':colbert', ...) as well as real
	# full-episode URLs on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a media configuration document."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the show index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve an episode (or shortcut) URL and download each media part.

		Shortcut names are first mapped to the show's full-episodes index;
		if no specific episode is given, the server redirect to the newest
		episode is followed. Each <item> of the episode's MRSS index is
		downloaded as a separate video.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('shortname'):
			# Map ':tds' / ':colbert' style shortcuts to the real index URL
			# and re-match so the named groups below are populated.
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode in the URL means: download the newest one (the site
		# redirects the index URL to it).
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the redirected URL to learn which episode we landed on.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player embed carries both the player URL and the mtvn uri.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow redirects once to obtain the final player URL (needed later
		# for rtmpdump-style downloads).
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per act/part of the episode; each is downloaded on its own.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect all available (bitrate, url) renditions for this part.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# One failed part should not abort the remaining parts.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3228
3229
3230 class EscapistIE(InfoExtractor):
3231         """Information extractor for The Escapist """
3232
3233         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3234         IE_NAME = u'escapist'
3235
3236         def report_extraction(self, showName):
3237                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3238
3239         def report_config_download(self, showName):
3240                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3241
3242         def _real_extract(self, url):
3243                 htmlParser = HTMLParser.HTMLParser()
3244
3245                 mobj = re.match(self._VALID_URL, url)
3246                 if mobj is None:
3247                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3248                         return
3249                 showName = mobj.group('showname')
3250                 videoId = mobj.group('episode')
3251
3252                 self.report_extraction(showName)
3253                 try:
3254                         webPage = urllib2.urlopen(url).read()
3255                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3256                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3257                         return
3258
3259                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3260                 description = htmlParser.unescape(descMatch.group(1))
3261                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3262                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3263                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3264                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3265                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3266                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3267
3268                 self.report_config_download(showName)
3269                 try:
3270                         configJSON = urllib2.urlopen(configUrl).read()
3271                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3272                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3273                         return
3274
3275                 # Technically, it's JavaScript, not JSON
3276                 configJSON = configJSON.replace("'", '"')
3277
3278                 try:
3279                         config = json.loads(configJSON)
3280                 except (ValueError,), err:
3281                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3282                         return
3283
3284                 playlist = config['playlist']
3285                 videoUrl = playlist[1]['url']
3286
3287                 self._downloader.increment_downloads()
3288                 info = {
3289                         'id': videoId,
3290                         'url': videoUrl,
3291                         'uploader': showName,
3292                         'upload_date': None,
3293                         'title': showName,
3294                         'stitle': _simplify_title(showName),
3295                         'ext': 'flv',
3296                         'format': 'flv',
3297                         'thumbnail': imgUrl,
3298                         'description': description,
3299                         'player_url': playerUrl,
3300                 }
3301
3302                 try:
3303                         self._downloader.process_info(info)
3304                 except UnavailableVideoError, err:
3305                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3306
3307
3308 class CollegeHumorIE(InfoExtractor):
3309         """Information extractor for collegehumor.com"""
3310
3311         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3312         IE_NAME = u'collegehumor'
3313
3314         def report_webpage(self, video_id):
3315                 """Report information extraction."""
3316                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3317
3318         def report_extraction(self, video_id):
3319                 """Report information extraction."""
3320                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3321
3322         def _real_extract(self, url):
3323                 htmlParser = HTMLParser.HTMLParser()
3324
3325                 mobj = re.match(self._VALID_URL, url)
3326                 if mobj is None:
3327                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3328                         return
3329                 video_id = mobj.group('videoid')
3330
3331                 self.report_webpage(video_id)
3332                 request = urllib2.Request(url)
3333                 try:
3334                         webpage = urllib2.urlopen(request).read()
3335                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3336                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3337                         return
3338
3339                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3340                 if m is None:
3341                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3342                         return
3343                 internal_video_id = m.group('internalvideoid')
3344
3345                 info = {
3346                         'id': video_id,
3347                         'internal_id': internal_video_id,
3348                 }
3349
3350                 self.report_extraction(video_id)
3351                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3352                 try:
3353                         metaXml = urllib2.urlopen(xmlUrl).read()
3354                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3356                         return
3357
3358                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3359                 try:
3360                         videoNode = mdoc.findall('./video')[0]
3361                         info['description'] = videoNode.findall('./description')[0].text
3362                         info['title'] = videoNode.findall('./caption')[0].text
3363                         info['stitle'] = _simplify_title(info['title'])
3364                         info['url'] = videoNode.findall('./file')[0].text
3365                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3366                         info['ext'] = info['url'].rpartition('.')[2]
3367                         info['format'] = info['ext']
3368                 except IndexError:
3369                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3370                         return
3371
3372                 self._downloader.increment_downloads()
3373
3374                 try:
3375                         self._downloader.process_info(info)
3376                 except UnavailableVideoError, err:
3377                         self._downloader.trouble(u'\nERROR: unable to download video')
3378
3379
3380 class XVideosIE(InfoExtractor):
3381         """Information extractor for xvideos.com"""
3382
3383         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3384         IE_NAME = u'xvideos'
3385
3386         def report_webpage(self, video_id):
3387                 """Report information extraction."""
3388                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3389
3390         def report_extraction(self, video_id):
3391                 """Report information extraction."""
3392                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3393
3394         def _real_extract(self, url):
3395                 htmlParser = HTMLParser.HTMLParser()
3396
3397                 mobj = re.match(self._VALID_URL, url)
3398                 if mobj is None:
3399                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3400                         return
3401                 video_id = mobj.group(1).decode('utf-8')
3402
3403                 self.report_webpage(video_id)
3404
3405                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3406                 try:
3407                         webpage = urllib2.urlopen(request).read()
3408                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3409                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3410                         return
3411
3412                 self.report_extraction(video_id)
3413
3414
3415                 # Extract video URL
3416                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3417                 if mobj is None:
3418                         self._downloader.trouble(u'ERROR: unable to extract video url')
3419                         return
3420                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3421
3422
3423                 # Extract title
3424                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3425                 if mobj is None:
3426                         self._downloader.trouble(u'ERROR: unable to extract video title')
3427                         return
3428                 video_title = mobj.group(1).decode('utf-8')
3429
3430
3431                 # Extract video thumbnail
3432                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3433                 if mobj is None:
3434                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3435                         return
3436                 video_thumbnail = mobj.group(1).decode('utf-8')
3437
3438
3439
3440                 self._downloader.increment_downloads()
3441                 info = {
3442                         'id': video_id,
3443                         'url': video_url,
3444                         'uploader': None,
3445                         'upload_date': None,
3446                         'title': video_title,
3447                         'stitle': _simplify_title(video_title),
3448                         'ext': 'flv',
3449                         'format': 'flv',
3450                         'thumbnail': video_thumbnail,
3451                         'description': None,
3452                         'player_url': None,
3453                 }
3454
3455                 try:
3456                         self._downloader.process_info(info)
3457                 except UnavailableVideoError, err:
3458                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3459
3460
3461 class SoundcloudIE(InfoExtractor):
3462         """Information extractor for soundcloud.com
3463            To access the media, the uid of the song and a stream token
3464            must be extracted from the page source and the script must make
3465            a request to media.soundcloud.com/crossdomain.xml. Then
3466            the media can be grabbed by requesting from an url composed
3467            of the stream token and uid
3468          """
3469
3470         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3471         IE_NAME = u'soundcloud'
3472
3473         def __init__(self, downloader=None):
3474                 InfoExtractor.__init__(self, downloader)
3475
3476         def report_webpage(self, video_id):
3477                 """Report information extraction."""
3478                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3479
3480         def report_extraction(self, video_id):
3481                 """Report information extraction."""
3482                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3483
3484         def _real_extract(self, url):
3485                 htmlParser = HTMLParser.HTMLParser()
3486
3487                 mobj = re.match(self._VALID_URL, url)
3488                 if mobj is None:
3489                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3490                         return
3491
3492                 # extract uploader (which is in the url)
3493                 uploader = mobj.group(1).decode('utf-8')
3494                 # extract simple title (uploader + slug of song title)
3495                 slug_title =  mobj.group(2).decode('utf-8')
3496                 simple_title = uploader + '-' + slug_title
3497
3498                 self.report_webpage('%s/%s' % (uploader, slug_title))
3499
3500                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3501                 try:
3502                         webpage = urllib2.urlopen(request).read()
3503                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3504                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3505                         return
3506
3507                 self.report_extraction('%s/%s' % (uploader, slug_title))
3508
3509                 # extract uid and stream token that soundcloud hands out for access
3510                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3511                 if mobj:
3512                         video_id = mobj.group(1)
3513                         stream_token = mobj.group(2)
3514
3515                 # extract unsimplified title
3516                 mobj = re.search('"title":"(.*?)",', webpage)
3517                 if mobj:
3518                         title = mobj.group(1)
3519
3520                 # construct media url (with uid/token)
3521                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3522                 mediaURL = mediaURL % (video_id, stream_token)
3523
3524                 # description
3525                 description = u'No description available'
3526                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3527                 if mobj:
3528                         description = mobj.group(1)
3529                 
3530                 # upload date
3531                 upload_date = None
3532                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3533                 if mobj:
3534                         try:
3535                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3536                         except Exception, e:
3537                                 print str(e)
3538
3539                 # for soundcloud, a request to a cross domain is required for cookies
3540                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3541
3542                 try:
3543                         self._downloader.process_info({
3544                                 'id':           video_id.decode('utf-8'),
3545                                 'url':          mediaURL,
3546                                 'uploader':     uploader.decode('utf-8'),
3547                                 'upload_date':  upload_date,
3548                                 'title':        simple_title.decode('utf-8'),
3549                                 'stitle':       simple_title.decode('utf-8'),
3550                                 'ext':          u'mp3',
3551                                 'format':       u'NA',
3552                                 'player_url':   None,
3553                                 'description': description.decode('utf-8')
3554                         })
3555                 except UnavailableVideoError:
3556                         self._downloader.trouble(u'\nERROR: unable to download video')
3557
3558
3559 class InfoQIE(InfoExtractor):
3560         """Information extractor for infoq.com"""
3561
3562         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3563         IE_NAME = u'infoq'
3564
3565         def report_webpage(self, video_id):
3566                 """Report information extraction."""
3567                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3568
3569         def report_extraction(self, video_id):
3570                 """Report information extraction."""
3571                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3572
3573         def _real_extract(self, url):
3574                 htmlParser = HTMLParser.HTMLParser()
3575
3576                 mobj = re.match(self._VALID_URL, url)
3577                 if mobj is None:
3578                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3579                         return
3580
3581                 self.report_webpage(url)
3582
3583                 request = urllib2.Request(url)
3584                 try:
3585                         webpage = urllib2.urlopen(request).read()
3586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3587                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3588                         return
3589
3590                 self.report_extraction(url)
3591
3592
3593                 # Extract video URL
3594                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3595                 if mobj is None:
3596                         self._downloader.trouble(u'ERROR: unable to extract video url')
3597                         return
3598                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3599
3600
3601                 # Extract title
3602                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3603                 if mobj is None:
3604                         self._downloader.trouble(u'ERROR: unable to extract video title')
3605                         return
3606                 video_title = mobj.group(1).decode('utf-8')
3607
3608                 # Extract description
3609                 video_description = u'No description available.'
3610                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3611                 if mobj is not None:
3612                         video_description = mobj.group(1).decode('utf-8')
3613
3614                 video_filename = video_url.split('/')[-1]
3615                 video_id, extension = video_filename.split('.')
3616
3617                 self._downloader.increment_downloads()
3618                 info = {
3619                         'id': video_id,
3620                         'url': video_url,
3621                         'uploader': None,
3622                         'upload_date': None,
3623                         'title': video_title,
3624                         'stitle': _simplify_title(video_title),
3625                         'ext': extension,
3626                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3627                         'thumbnail': None,
3628                         'description': video_description,
3629                         'player_url': None,
3630                 }
3631
3632                 try:
3633                         self._downloader.process_info(info)
3634                 except UnavailableVideoError, err:
3635                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3636
3637 class MixcloudIE(InfoExtractor):
3638         """Information extractor for www.mixcloud.com"""
3639         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3640         IE_NAME = u'mixcloud'
3641
3642         def __init__(self, downloader=None):
3643                 InfoExtractor.__init__(self, downloader)
3644
3645         def report_download_json(self, file_id):
3646                 """Report JSON download."""
3647                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3648
3649         def report_extraction(self, file_id):
3650                 """Report information extraction."""
3651                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3652
3653         def get_urls(self, jsonData, fmt, bitrate='best'):
3654                 """Get urls from 'audio_formats' section in json"""
3655                 file_url = None
3656                 try:
3657                         bitrate_list = jsonData[fmt]
3658                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3659                                 bitrate = max(bitrate_list) # select highest
3660
3661                         url_list = jsonData[fmt][bitrate]
3662                 except TypeError: # we have no bitrate info.
3663                         url_list = jsonData[fmt]
3664                                 
3665                 return url_list
3666
3667         def check_urls(self, url_list):
3668                 """Returns 1st active url from list"""
3669                 for url in url_list:
3670                         try:
3671                                 urllib2.urlopen(url)
3672                                 return url
3673                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3674                                 url = None
3675
3676                 return None
3677
3678         def _print_formats(self, formats):
3679                 print 'Available formats:'
3680                 for fmt in formats.keys():
3681                         for b in formats[fmt]:
3682                                 try:
3683                                         ext = formats[fmt][b][0]
3684                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3685                                 except TypeError: # we have no bitrate info
3686                                         ext = formats[fmt][0]
3687                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3688                                         break
3689
3690         def _real_extract(self, url):
3691                 mobj = re.match(self._VALID_URL, url)
3692                 if mobj is None:
3693                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3694                         return
3695                 # extract uploader & filename from url
3696                 uploader = mobj.group(1).decode('utf-8')
3697                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3698
3699                 # construct API request
3700                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3701                 # retrieve .json file with links to files
3702                 request = urllib2.Request(file_url)
3703                 try:
3704                         self.report_download_json(file_url)
3705                         jsonData = urllib2.urlopen(request).read()
3706                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3707                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3708                         return
3709
3710                 # parse JSON
3711                 json_data = json.loads(jsonData)
3712                 player_url = json_data['player_swf_url']
3713                 formats = dict(json_data['audio_formats'])
3714
3715                 req_format = self._downloader.params.get('format', None)
3716                 bitrate = None
3717
3718                 if self._downloader.params.get('listformats', None):
3719                         self._print_formats(formats)
3720                         return
3721
3722                 if req_format is None or req_format == 'best':
3723                         for format_param in formats.keys():
3724                                 url_list = self.get_urls(formats, format_param)
3725                                 # check urls
3726                                 file_url = self.check_urls(url_list)
3727                                 if file_url is not None:
3728                                         break # got it!
3729                 else:
3730                         if req_format not in formats.keys():
3731                                 self._downloader.trouble(u'ERROR: format is not available')
3732                                 return
3733
3734                         url_list = self.get_urls(formats, req_format)
3735                         file_url = self.check_urls(url_list)
3736                         format_param = req_format
3737
3738                 # We have audio
3739                 self._downloader.increment_downloads()
3740                 try:
3741                         # Process file information
3742                         self._downloader.process_info({
3743                                 'id':           file_id.decode('utf-8'),
3744                                 'url':          file_url.decode('utf-8'),
3745                                 'uploader':     uploader.decode('utf-8'),
3746                                 'upload_date':  u'NA',
3747                                 'title':        json_data['name'],
3748                                 'stitle':       _simplify_title(json_data['name']),
3749                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3750                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3751                                 'thumbnail':    json_data['thumbnail_url'],
3752                                 'description':  json_data['description'],
3753                                 'player_url':   player_url.decode('utf-8'),
3754                         })
3755                 except UnavailableVideoError, err:
3756                         self._downloader.trouble(u'ERROR: unable to download file')
3757
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Three URL shapes are matched: a specific video page (course and
	# video query parameters), a course page (course only), and the
	# site root (neither).
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		# Dispatch on which named groups matched. The course-page and
		# root-page branches build playlist info dicts whose entries
		# are re-fed through self.extract() one by one.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata (title, video file name) lives in an XML
			# file next to the videos.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title from the page heading, falling back to the id
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect every video page linked from the course page, in
			# first-appearance order, and extract each one.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect every course page linked from the root page and
			# recurse into each via the course-page branch above.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3876
3877
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is attached to a downloader through the
	downloader's add_post_processor() method. After each successful
	download, the downloader walks its list of post processors and
	invokes run() on each one: the first call receives an initial
	information dictionary, and every subsequent call receives
	whatever the previous processor returned.

	Returning None from run() aborts the remainder of the chain;
	otherwise processing continues until the end of the chain.

	Like InfoExtractor objects, PostProcessors take part in a
	"mutual registration" handshake with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary of the kind produced by the
		InfoExtractors, extended with an extra "filepath" key naming
		the file that was just downloaded.

		A return value of None stops the postprocessing chain; any
		other return value is treated as an information dictionary
		for the next processor in the chain (it may simply be the
		input dictionary with some fields adjusted).

		Implementations may instead raise a PostProcessingError,
		which the calling downloader takes into account.
		"""
		return information # by default, do nothing
3923
3924
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone audio file using ffmpeg/ffprobe.

	preferredcodec may be 'best' (default), 'mp3', 'aac', 'm4a' or
	'vorbis'; preferredquality is passed to ffmpeg as -ab; keepvideo
	controls whether the source video file is removed afterwards.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name reported by ffprobe for path,
		or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# open() instead of the py2-only file() builtin; close the
			# devnull handle explicitly instead of leaking it.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			# ffprobe missing or not executable
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name precedes codec_type within a stream block
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path into out_path with the given audio codec and
		extra ffmpeg options. Returns True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Narrowed from a bare except: this is best-effort, but
				# KeyboardInterrupt/SystemExit must not be swallowed.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4031
4032
4033 def updateSelf(downloader, filename):
4034         ''' Update the program file with the latest version from the repository '''
4035         # Note: downloader only used for options
4036         if not os.access(filename, os.W_OK):
4037                 sys.exit('ERROR: no write permissions on %s' % filename)
4038
4039         downloader.to_screen('Updating to latest version...')
4040
4041         try:
4042                 try:
4043                         urlh = urllib.urlopen(UPDATE_URL)
4044                         newcontent = urlh.read()
4045                         
4046                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4047                         if vmatch is not None and vmatch.group(1) == __version__:
4048                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4049                                 return
4050                 finally:
4051                         urlh.close()
4052         except (IOError, OSError), err:
4053                 sys.exit('ERROR: unable to download latest version')
4054
4055         try:
4056                 outf = open(filename, 'wb')
4057                 try:
4058                         outf.write(newcontent)
4059                 finally:
4060                         outf.close()
4061         except (IOError, OSError), err:
4062                 sys.exit('ERROR: unable to overwrite current version')
4063
4064         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4065
4066 def parseOpts():
4067         # Deferred imports
4068         import getpass
4069         import optparse
4070         import shlex
4071
4072         def _readOptions(filename):
4073                 try:
4074                         optionf = open(filename)
4075                 except IOError:
4076                         return [] # silently skip if file is not present
4077                 try:
4078                         res = []
4079                         for l in optionf:
4080                                 res += shlex.split(l, comments=True)
4081                 finally:
4082                         optionf.close()
4083                 return res
4084
4085         def _format_option_string(option):
4086                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4087
4088                 opts = []
4089
4090                 if option._short_opts: opts.append(option._short_opts[0])
4091                 if option._long_opts: opts.append(option._long_opts[0])
4092                 if len(opts) > 1: opts.insert(1, ', ')
4093
4094                 if option.takes_value(): opts.append(' %s' % option.metavar)
4095
4096                 return "".join(opts)
4097
4098         def _find_term_columns():
4099                 columns = os.environ.get('COLUMNS', None)
4100                 if columns:
4101                         return int(columns)
4102
4103                 try:
4104                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4105                         out,err = sp.communicate()
4106                         return int(out.split()[1])
4107                 except:
4108                         pass
4109                 return None
4110
4111         max_width = 80
4112         max_help_position = 80
4113
4114         # No need to wrap help messages if we're on a wide console
4115         columns = _find_term_columns()
4116         if columns: max_width = columns
4117
4118         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4119         fmt.format_option_strings = _format_option_string
4120
4121         kw = {
4122                 'version'   : __version__,
4123                 'formatter' : fmt,
4124                 'usage' : '%prog [options] url [url...]',
4125                 'conflict_handler' : 'resolve',
4126         }
4127
4128         parser = optparse.OptionParser(**kw)
4129
4130         # option groups
4131         general        = optparse.OptionGroup(parser, 'General Options')
4132         selection      = optparse.OptionGroup(parser, 'Video Selection')
4133         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4134         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4135         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4136         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4137         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4138
4139         general.add_option('-h', '--help',
4140                         action='help', help='print this help text and exit')
4141         general.add_option('-v', '--version',
4142                         action='version', help='print program version and exit')
4143         general.add_option('-U', '--update',
4144                         action='store_true', dest='update_self', help='update this program to latest version')
4145         general.add_option('-i', '--ignore-errors',
4146                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4147         general.add_option('-r', '--rate-limit',
4148                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4149         general.add_option('-R', '--retries',
4150                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4151         general.add_option('--dump-user-agent',
4152                         action='store_true', dest='dump_user_agent',
4153                         help='display the current browser identification', default=False)
4154         general.add_option('--list-extractors',
4155                         action='store_true', dest='list_extractors',
4156                         help='List all supported extractors and the URLs they would handle', default=False)
4157
4158         selection.add_option('--playlist-start',
4159                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4160         selection.add_option('--playlist-end',
4161                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4162         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4163         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4164         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4165
4166         authentication.add_option('-u', '--username',
4167                         dest='username', metavar='USERNAME', help='account username')
4168         authentication.add_option('-p', '--password',
4169                         dest='password', metavar='PASSWORD', help='account password')
4170         authentication.add_option('-n', '--netrc',
4171                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4172
4173
4174         video_format.add_option('-f', '--format',
4175                         action='store', dest='format', metavar='FORMAT', help='video format code')
4176         video_format.add_option('--all-formats',
4177                         action='store_const', dest='format', help='download all available video formats', const='all')
4178         video_format.add_option('--max-quality',
4179                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4180         video_format.add_option('-F', '--list-formats',
4181                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4182
4183
4184         verbosity.add_option('-q', '--quiet',
4185                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4186         verbosity.add_option('-s', '--simulate',
4187                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4188         verbosity.add_option('--skip-download',
4189                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4190         verbosity.add_option('-g', '--get-url',
4191                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4192         verbosity.add_option('-e', '--get-title',
4193                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4194         verbosity.add_option('--get-thumbnail',
4195                         action='store_true', dest='getthumbnail',
4196                         help='simulate, quiet but print thumbnail URL', default=False)
4197         verbosity.add_option('--get-description',
4198                         action='store_true', dest='getdescription',
4199                         help='simulate, quiet but print video description', default=False)
4200         verbosity.add_option('--get-filename',
4201                         action='store_true', dest='getfilename',
4202                         help='simulate, quiet but print output filename', default=False)
4203         verbosity.add_option('--get-format',
4204                         action='store_true', dest='getformat',
4205                         help='simulate, quiet but print output format', default=False)
4206         verbosity.add_option('--no-progress',
4207                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4208         verbosity.add_option('--console-title',
4209                         action='store_true', dest='consoletitle',
4210                         help='display progress in console titlebar', default=False)
4211
4212
4213         filesystem.add_option('-t', '--title',
4214                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4215         filesystem.add_option('-l', '--literal',
4216                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4217         filesystem.add_option('-A', '--auto-number',
4218                         action='store_true', dest='autonumber',
4219                         help='number downloaded files starting from 00000', default=False)
4220         filesystem.add_option('-o', '--output',
4221                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4222         filesystem.add_option('-a', '--batch-file',
4223                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4224         filesystem.add_option('-w', '--no-overwrites',
4225                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4226         filesystem.add_option('-c', '--continue',
4227                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4228         filesystem.add_option('--no-continue',
4229                         action='store_false', dest='continue_dl',
4230                         help='do not resume partially downloaded files (restart from beginning)')
4231         filesystem.add_option('--cookies',
4232                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4233         filesystem.add_option('--no-part',
4234                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4235         filesystem.add_option('--no-mtime',
4236                         action='store_false', dest='updatetime',
4237                         help='do not use the Last-modified header to set the file modification time', default=True)
4238         filesystem.add_option('--write-description',
4239                         action='store_true', dest='writedescription',
4240                         help='write video description to a .description file', default=False)
4241         filesystem.add_option('--write-info-json',
4242                         action='store_true', dest='writeinfojson',
4243                         help='write video metadata to a .info.json file', default=False)
4244
4245
4246         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4247                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4248         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4249                         help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4250         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4251                         help='ffmpeg audio bitrate specification, 128k by default')
4252         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4253                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4254
4255
4256         parser.add_option_group(general)
4257         parser.add_option_group(selection)
4258         parser.add_option_group(filesystem)
4259         parser.add_option_group(verbosity)
4260         parser.add_option_group(video_format)
4261         parser.add_option_group(authentication)
4262         parser.add_option_group(postproc)
4263
4264         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4265         if xdg_config_home:
4266                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4267         else:
4268                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4269         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4270         opts, args = parser.parse_args(argv)
4271
4272         return parser, opts, args
4273
def gen_extractors():
	"""Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# Some extractors are reused as delegates by their playlist/search/user
	# wrappers, so instantiate those once and share the instance.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
	]
	# The catch-all extractor must come last so it only handles URLs that
	# nothing above claimed.
	extractors.append(GenericIE())
	return extractors
4309
4310 def _real_main():
4311         parser, opts, args = parseOpts()
4312
4313         # Open appropriate CookieJar
4314         if opts.cookiefile is None:
4315                 jar = cookielib.CookieJar()
4316         else:
4317                 try:
4318                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4319                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4320                                 jar.load()
4321                 except (IOError, OSError), err:
4322                         sys.exit(u'ERROR: unable to open cookie file')
4323
4324         # Dump user agent
4325         if opts.dump_user_agent:
4326                 print std_headers['User-Agent']
4327                 sys.exit(0)
4328
4329         # Batch file verification
4330         batchurls = []
4331         if opts.batchfile is not None:
4332                 try:
4333                         if opts.batchfile == '-':
4334                                 batchfd = sys.stdin
4335                         else:
4336                                 batchfd = open(opts.batchfile, 'r')
4337                         batchurls = batchfd.readlines()
4338                         batchurls = [x.strip() for x in batchurls]
4339                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4340                 except IOError:
4341                         sys.exit(u'ERROR: batch file could not be read')
4342         all_urls = batchurls + args
4343
4344         # General configuration
4345         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4346         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4347         urllib2.install_opener(opener)
4348         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4349
4350         extractors = gen_extractors()
4351
4352         if opts.list_extractors:
4353                 for ie in extractors:
4354                         print(ie.IE_NAME)
4355                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4356                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4357                         for mu in matchedUrls:
4358                                 print(u'  ' + mu)
4359                 sys.exit(0)
4360
4361         # Conflicting, missing and erroneous options
4362         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4363                 parser.error(u'using .netrc conflicts with giving username/password')
4364         if opts.password is not None and opts.username is None:
4365                 parser.error(u'account username missing')
4366         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4367                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4368         if opts.usetitle and opts.useliteral:
4369                 parser.error(u'using title conflicts with using literal title')
4370         if opts.username is not None and opts.password is None:
4371                 opts.password = getpass.getpass(u'Type account password and press return:')
4372         if opts.ratelimit is not None:
4373                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4374                 if numeric_limit is None:
4375                         parser.error(u'invalid rate limit specified')
4376                 opts.ratelimit = numeric_limit
4377         if opts.retries is not None:
4378                 try:
4379                         opts.retries = long(opts.retries)
4380                 except (TypeError, ValueError), err:
4381                         parser.error(u'invalid retry count specified')
4382         try:
4383                 opts.playliststart = int(opts.playliststart)
4384                 if opts.playliststart <= 0:
4385                         raise ValueError(u'Playlist start must be positive')
4386         except (TypeError, ValueError), err:
4387                 parser.error(u'invalid playlist start number specified')
4388         try:
4389                 opts.playlistend = int(opts.playlistend)
4390                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4391                         raise ValueError(u'Playlist end must be greater than playlist start')
4392         except (TypeError, ValueError), err:
4393                 parser.error(u'invalid playlist end number specified')
4394         if opts.extractaudio:
4395                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4396                         parser.error(u'invalid audio format specified')
4397
4398         # File downloader
4399         fd = FileDownloader({
4400                 'usenetrc': opts.usenetrc,
4401                 'username': opts.username,
4402                 'password': opts.password,
4403                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4404                 'forceurl': opts.geturl,
4405                 'forcetitle': opts.gettitle,
4406                 'forcethumbnail': opts.getthumbnail,
4407                 'forcedescription': opts.getdescription,
4408                 'forcefilename': opts.getfilename,
4409                 'forceformat': opts.getformat,
4410                 'simulate': opts.simulate,
4411                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4412                 'format': opts.format,
4413                 'format_limit': opts.format_limit,
4414                 'listformats': opts.listformats,
4415                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4416                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4417                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4418                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4419                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4420                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4421                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4422                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4423                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4424                         or u'%(id)s.%(ext)s'),
4425                 'ignoreerrors': opts.ignoreerrors,
4426                 'ratelimit': opts.ratelimit,
4427                 'nooverwrites': opts.nooverwrites,
4428                 'retries': opts.retries,
4429                 'continuedl': opts.continue_dl,
4430                 'noprogress': opts.noprogress,
4431                 'playliststart': opts.playliststart,
4432                 'playlistend': opts.playlistend,
4433                 'logtostderr': opts.outtmpl == '-',
4434                 'consoletitle': opts.consoletitle,
4435                 'nopart': opts.nopart,
4436                 'updatetime': opts.updatetime,
4437                 'writedescription': opts.writedescription,
4438                 'writeinfojson': opts.writeinfojson,
4439                 'matchtitle': opts.matchtitle,
4440                 'rejecttitle': opts.rejecttitle,
4441                 'max_downloads': opts.max_downloads,
4442                 })
4443         for extractor in extractors:
4444                 fd.add_info_extractor(extractor)
4445
4446         # PostProcessors
4447         if opts.extractaudio:
4448                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4449
4450         # Update version
4451         if opts.update_self:
4452                 updateSelf(fd, sys.argv[0])
4453
4454         # Maybe do nothing
4455         if len(all_urls) < 1:
4456                 if not opts.update_self:
4457                         parser.error(u'you must provide at least one URL')
4458                 else:
4459                         sys.exit()
4460         
4461         try:
4462                 retcode = fd.download(all_urls)
4463         except MaxDownloadsReached:
4464                 fd.to_screen(u'--max-download limit reached, aborting.')
4465                 retcode = 101
4466
4467         # Dump cookie jar if requested
4468         if opts.cookiefile is not None:
4469                 try:
4470                         jar.save()
4471                 except (IOError, OSError), err:
4472                         sys.exit(u'ERROR: unable to save cookie jar')
4473
4474         sys.exit(retcode)
4475
def main():
	"""Top-level entry point: run _real_main and map the known fatal
	exceptions to a process exit status or error message.
	"""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: