d02178cc3f8dc3f72448a91207ea364e83f537b2
[youtube-dl.git] / youtube_dl / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.12.15'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers attached to every request by YoutubeDLHandler.http_request
# (which overwrites any caller-supplied values for these keys).
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
82
# Python 2.6+ ships a native json module; on older interpreters fall back to a
# minimal pure-Python parser (trivialjson) that exposes the same json.loads()
# entry point used elsewhere in this file.
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		@staticmethod
		def loads(s):
			# Decode the UTF-8 byte string *s* and return the Python value it
			# encodes. Implemented as a small recursive-descent parser; every
			# parseX helper takes an index and returns (next_index, value).
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors report the failing position and the rest of the input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally fail on premature end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape: a simple escape, \uXXXX, or a
				# \uXXXX\uXXXX UTF-16 surrogate pair.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair: combine high and low halves into one codepoint.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse a double-quoted string whose opening quote is at s[i].
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count the backslashes immediately before the quote: an odd
					# count means the quote is escaped, not a terminator.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Substitute every escape sequence (incl. surrogate pairs) in one pass.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse an object whose '{' is at s[i]; returns a dict.
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse an array whose '[' is at s[i]; returns a list.
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse one of the literals true/false/null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse a JSON number; int when it has no fraction/exponent, else float.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of the value; anything not listed
			# is assumed to be a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Broken/unknown locale: fall back to a safe default. (The previous
		# bare 'except:' also swallowed KeyboardInterrupt/SystemExit.)
		pref = 'UTF-8'
	return pref
211
212
213 def htmlentity_transform(matchobj):
214         """Transforms an HTML entity to a Unicode character.
215
216         This function receives a match object and is intended to be used with
217         the re.sub() function.
218         """
219         entity = matchobj.group(1)
220
221         # Known non-numeric HTML entity
222         if entity in htmlentitydefs.name2codepoint:
223                 return unichr(htmlentitydefs.name2codepoint[entity])
224
225         # Unicode character
226         mobj = re.match(ur'(?u)#(x?\d+)', entity)
227         if mobj is not None:
228                 numstr = mobj.group(1)
229                 if numstr.startswith(u'x'):
230                         base = 16
231                         numstr = u'0%s' % numstr
232                 else:
233                         base = 10
234                 return unichr(long(numstr, base))
235
236         # Unknown entity in name, return its literal representation
237         return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241         """Sanitizes a video title so it could be used as part of a filename."""
242         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243         return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means "write to standard output". On Windows, switch stdout
			# to binary mode so the video data is not mangled by CRLF translation.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is None:
		# Unparsable date string
		return None
	return email.utils.mktime_tz(timetuple)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286         """ Remove all duplicates from the input iterable """
287         res = []
288         for el in iterable:
289                 if el not in res:
290                         res.append(el)
291         return res
292
class DownloadError(Exception):
	"""Raised by FileDownloader objects when a download problem occurs.

	Thrown only when the downloader is not configured to continue on
	errors; carries the appropriate error message.
	"""
301
302
class SameFileError(Exception):
	"""Raised by FileDownloader objects on filename collisions.

	Signals that multiple requested files would have to be written to the
	same file on disk.
	"""
310
311
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() method.

	Indicates that an error occurred while carrying out the
	postprocessing task.
	"""
319
class MaxDownloadsReached(Exception):
	"""Raised once the --max-downloads limit has been hit."""
323
324
class UnavailableVideoError(Exception):
	"""Raised when a video is requested in a format it does not offer."""
332
333
class ContentTooShortError(Exception):
	"""Raised when a downloaded file is smaller than announced.

	FileDownloader objects raise this when the received byte count falls
	short of what the server declared, which usually means the connection
	was interrupted.
	"""
	# Both counters are in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw (headerless) deflate first; fall back to standard zlib
		# decoding when the stream does carry a zlib header.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer urllib2.addinfourl accepts the status code in the constructor
		# (detected via the 'getcode' attribute); older versions need the
		# attribute set after construction.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the std_headers values, overwriting any caller-supplied ones.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Internal opt-out marker: strip Accept-encoding so the server sends
		# an uncompressed body, then remove the marker before the real request.
		# NOTE(review): the lookup spelling '...-no-compression' differs from the
		# docstring's 'Youtubedl-No-Compression' — presumably this matches
		# urllib2's header-name normalization; confirm before changing.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the response body, preserving the original
		# headers, URL, status code and msg on the replacement response object.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
407
408
409 class FileDownloader(object):
410         """File Downloader class.
411
412         File downloader objects are the ones responsible of downloading the
413         actual video file and writing it to disk if the user has requested
414         it, among some other tasks. In most cases there should be one per
415         program. As, given a video URL, the downloader doesn't know how to
416         extract all the needed information, task that InfoExtractors do, it
417         has to pass the URL to one of them.
418
419         For this, file downloader objects have a method that allows
420         InfoExtractors to be registered in a given order. When it is passed
421         a URL, the file downloader handles it to the first InfoExtractor it
422         finds that reports being able to handle it. The InfoExtractor extracts
423         all the information about the video or videos the URL refers to, and
424         asks the FileDownloader to process the video information, possibly
425         downloading the video.
426
427         File downloaders accept a lot of parameters. In order not to saturate
428         the object constructor with arguments, it receives a dictionary of
429         options instead. These options are available through the params
430         attribute for the InfoExtractors to use. The FileDownloader also
431         registers itself as the downloader in charge for the InfoExtractors
432         that are added to it, so this is a "mutual registration".
433
434         Available options:
435
436         username:         Username for authentication purposes.
437         password:         Password for authentication purposes.
438         usenetrc:         Use netrc for authentication instead.
439         quiet:            Do not print messages to stdout.
440         forceurl:         Force printing final URL.
441         forcetitle:       Force printing title.
442         forcethumbnail:   Force printing thumbnail URL.
443         forcedescription: Force printing description.
444         forcefilename:    Force printing final filename.
445         simulate:         Do not download the video files.
446         format:           Video format code.
447         format_limit:     Highest quality format to try.
448         outtmpl:          Template for output names.
449         ignoreerrors:     Do not stop on download errors.
450         ratelimit:        Download speed limit, in bytes/sec.
451         nooverwrites:     Prevent overwriting files.
452         retries:          Number of times to retry for HTTP error 5xx
453         continuedl:       Try to continue downloads if possible.
454         noprogress:       Do not print the progress bar.
455         playliststart:    Playlist item to start at.
456         playlistend:      Playlist item to end at.
457         matchtitle:       Download only matching titles.
458         rejecttitle:      Reject downloads for matching titles.
459         logtostderr:      Log messages to stderr instead of stdout.
460         consoletitle:     Display progress in console window's titlebar.
461         nopart:           Do not use temporary .part files.
462         updatetime:       Use the Last-modified header to set output file timestamps.
463         writedescription: Write the video description to a .description file
464         writeinfojson:    Write the video description to a .info.json file
465         """
466
	# Class-level defaults; __init__ replaces each with a per-instance value.
	params = None             # Options dictionary (see class docstring)
	_ies = []                 # Registered InfoExtractor objects
	_pps = []                 # Registered PostProcessor objects
	_download_retcode = None  # Return code: 0 ok, set to 1 by trouble()
	_num_downloads = None     # Ordinal counter feeding %(autonumber)s
	_screen_file = None       # Output stream for to_screen() (stdout or stderr)
473
474         def __init__(self, params):
475                 """Create a FileDownloader object with the given options."""
476                 self._ies = []
477                 self._pps = []
478                 self._download_retcode = 0
479                 self._num_downloads = 0
480                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
481                 self.params = params
482
483         @staticmethod
484         def format_bytes(bytes):
485                 if bytes is None:
486                         return 'N/A'
487                 if type(bytes) is str:
488                         bytes = float(bytes)
489                 if bytes == 0.0:
490                         exponent = 0
491                 else:
492                         exponent = long(math.log(bytes, 1024.0))
493                 suffix = 'bkMGTPEZY'[exponent]
494                 converted = float(bytes) / float(1024 ** exponent)
495                 return '%.2f%s' % (converted, suffix)
496
497         @staticmethod
498         def calc_percent(byte_counter, data_len):
499                 if data_len is None:
500                         return '---.-%'
501                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
503         @staticmethod
504         def calc_eta(start, now, total, current):
505                 if total is None:
506                         return '--:--'
507                 dif = now - start
508                 if current == 0 or dif < 0.001: # One millisecond
509                         return '--:--'
510                 rate = float(current) / dif
511                 eta = long((float(total) - float(current)) / rate)
512                 (eta_mins, eta_secs) = divmod(eta, 60)
513                 if eta_mins > 99:
514                         return '--:--'
515                 return '%02d:%02d' % (eta_mins, eta_secs)
516
517         @staticmethod
518         def calc_speed(start, now, bytes):
519                 dif = now - start
520                 if bytes == 0 or dif < 0.001: # One millisecond
521                         return '%10s' % '---b/s'
522                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
523
524         @staticmethod
525         def best_block_size(elapsed_time, bytes):
526                 new_min = max(bytes / 2.0, 1.0)
527                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528                 if elapsed_time < 0.001:
529                         return long(new_max)
530                 rate = bytes / elapsed_time
531                 if rate > new_max:
532                         return long(new_max)
533                 if rate < new_min:
534                         return long(new_min)
535                 return long(rate)
536
537         @staticmethod
538         def parse_bytes(bytestr):
539                 """Parse a string indicating a byte quantity into a long integer."""
540                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
541                 if matchobj is None:
542                         return None
543                 number = float(matchobj.group(1))
544                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545                 return long(round(number * multiplier))
546
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# "Mutual registration" (see class docstring): the IE keeps a
		# back-reference to this downloader.
		ie.set_downloader(self)
551
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# The post-processor keeps a back-reference to this downloader.
		pp.set_downloader(self)
556
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol suppresses the trailing newline; ignore_encoding_errors
		silently drops the message when it cannot be encoded for the console.
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma stops 'print' from emitting its own newline;
				# the chosen terminator is already embedded in the message.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			# Flushed even in quiet mode.
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
567
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the preferred system encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
571
	def to_cons_title(self, message):
		"""Set console/terminal window title to message (if consoletitle is set)."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# ESC ] 0 ; <text> BEL — the xterm-style escape sequence that sets
			# the terminal window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template is "fixed" when it contains no %(field)s placeholders,
		i.e. every download would be written to the same constant filename.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
586
587         def trouble(self, message=None):
588                 """Determine action to take when a download problem appears.
589
590                 Depending on if the downloader has been configured to ignore
591                 download errors or not, this method may throw an exception or
592                 not when errors are found, after printing the message.
593                 """
594                 if message is not None:
595                         self.to_stderr(message)
596                 if not self.params.get('ignoreerrors', False):
597                         raise DownloadError(message)
598                 self._download_retcode = 1
599
600         def slow_down(self, start_time, byte_counter):
601                 """Sleep if the download speed is over the rate limit."""
602                 rate_limit = self.params.get('ratelimit', None)
603                 if rate_limit is None or byte_counter == 0:
604                         return
605                 now = time.time()
606                 elapsed = now - start_time
607                 if elapsed <= 0.0:
608                         return
609                 speed = float(byte_counter) / elapsed
610                 if speed > rate_limit:
611                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613         def temp_name(self, filename):
614                 """Returns a temporary filename for the given filename."""
615                 if self.params.get('nopart', False) or filename == u'-' or \
616                                 (os.path.exists(filename) and not os.path.isfile(filename)):
617                         return filename
618                 return filename + u'.part'
619
620         def undo_temp_name(self, filename):
621                 if filename.endswith(u'.part'):
622                         return filename[:-len(u'.part')]
623                 return filename
624
625         def try_rename(self, old_filename, new_filename):
626                 try:
627                         if old_filename == new_filename:
628                                 return
629                         os.rename(old_filename, new_filename)
630                 except (IOError, OSError), err:
631                         self.trouble(u'ERROR: unable to rename file')
632
633         def try_utime(self, filename, last_modified_hdr):
634                 """Try to set the last-modified time of the given file."""
635                 if last_modified_hdr is None:
636                         return
637                 if not os.path.isfile(filename):
638                         return
639                 timestr = last_modified_hdr
640                 if timestr is None:
641                         return
642                 filetime = timeconvert(timestr)
643                 if filetime is None:
644                         return filetime
645                 try:
646                         os.utime(filename, (time.time(), filetime))
647                 except:
648                         pass
649                 return filetime
650
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		# ignore_encoding_errors: the filename may not be representable in the
		# console encoding; dropping the message is preferable to crashing.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
654
	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		# ignore_encoding_errors: the filename may not be console-encodable.
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
658
	def report_destination(self, filename):
		"""Report destination filename."""
		# ignore_encoding_errors: the filename may not be console-encodable.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
662
663         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664                 """Report download progress."""
665                 if self.params.get('noprogress', False):
666                         return
667                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
671
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
675
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
679
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The filename may not be printable on this console; fall back to
			# a message that does not include it.
			self.to_screen(u'[download] The file has already been downloaded')
686
687         def report_unable_to_resume(self):
688                 """Report it was impossible to resume download."""
689                 self.to_screen(u'[download] Unable to resume')
690
691         def report_finish(self):
692                 """Report download finished."""
693                 if self.params.get('noprogress', False):
694                         self.to_screen(u'[download] Download completed')
695                 else:
696                         self.to_screen(u'')
697
698         def increment_downloads(self):
699                 """Increment the ordinal that assigns a number to each file."""
700                 self._num_downloads += 1
701
702         def prepare_filename(self, info_dict):
703                 """Generate the output filename."""
704                 try:
705                         template_dict = dict(info_dict)
706                         template_dict['epoch'] = unicode(long(time.time()))
707                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708                         filename = self.params['outtmpl'] % template_dict
709                         return filename
710                 except (ValueError, KeyError), err:
711                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
712                         return None
713
714         def _match_entry(self, info_dict):
715                 """ Returns None iff the file should be downloaded """
716
717                 title = info_dict['title']
718                 matchtitle = self.params.get('matchtitle', False)
719                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721                 rejecttitle = self.params.get('rejecttitle', False)
722                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724                 return None
725
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Applies title match/reject filtering and the max-downloads cap,
		honours the forced-printing options, optionally writes the
		description and info-JSON side files, performs the download
		(unless skipped/simulated) and runs the postprocessing chain.
		"""

		# Skip videos filtered out by the matchtitle/rejecttitle options.
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		# Abort the whole run once the cap is exceeded; _num_downloads is
		# presumably advanced via increment_downloads() by the extractors
		# before each call here — TODO confirm against the IE code.
		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				raise MaxDownloadsReached()

		# May be None when the output template is broken; the forced
		# printings below still work, and we return before disk access.
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		# Create the target directory up front so the side files and the
		# download itself have somewhere to go.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable json.dump exists before opening the
			# file; on old interpreters 'json' may be missing or lack it.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds an open connection object and is
					# not JSON-serializable, so it is stripped here.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			# With nooverwrites an existing file counts as success so the
			# postprocessors still run on it.
			if self.params.get('nooverwrites', False) and os.path.exists(filename):
				success = True
			else:
				try:
					success = self._do_download(filename, info_dict)
				except (OSError, IOError), err:
					raise UnavailableVideoError
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self.trouble(u'ERROR: unable to download video data: %s' % str(err))
					return
				except (ContentTooShortError, ), err:
					self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
					return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
823
824         def download(self, url_list):
825                 """Download a given list of URLs."""
826                 if len(url_list) > 1 and self.fixed_template():
827                         raise SameFileError(self.params['outtmpl'])
828
829                 for url in url_list:
830                         suitable_found = False
831                         for ie in self._ies:
832                                 # Go to next InfoExtractor if not suitable
833                                 if not ie.suitable(url):
834                                         continue
835
836                                 # Suitable InfoExtractor found
837                                 suitable_found = True
838
839                                 # Extract information from URL and process it
840                                 ie.extract(url)
841
842                                 # Suitable InfoExtractor had been found; go to next URL
843                                 break
844
845                         if not suitable_found:
846                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
847
848                 return self._download_retcode
849
850         def post_process(self, filename, ie_info):
851                 """Run the postprocessing chain on the given file."""
852                 info = dict(ie_info)
853                 info['filepath'] = filename
854                 for pp in self._pps:
855                         info = pp.run(info)
856                         if info is None:
857                                 break
858
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Hand the download off to the external rtmpdump binary.

		Repeatedly re-invokes rtmpdump with -e to resume until no more
		progress is made. Returns True on success, False on failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		# NOTE(review): the devnull handle opened here is never closed.
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [list_a, list_b][condition] idiom below indexes on a bool:
		# False (0) picks the empty list, True (1) the extra arguments.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress since the last attempt: give up resuming.
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
895
896         def _do_download(self, filename, info_dict):
897                 url = info_dict['url']
898                 player_url = info_dict.get('player_url', None)
899
900                 # Check file already present
901                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
902                         self.report_file_already_downloaded(filename)
903                         return True
904
905                 # Attempt to download using rtmpdump
906                 if url.startswith('rtmp'):
907                         return self._download_with_rtmpdump(filename, url, player_url)
908
909                 tmpfilename = self.temp_name(filename)
910                 stream = None
911
912                 # Do not include the Accept-Encoding header
913                 headers = {'Youtubedl-no-compression': 'True'}
914                 basic_request = urllib2.Request(url, None, headers)
915                 request = urllib2.Request(url, None, headers)
916
917                 # Establish possible resume length
918                 if os.path.isfile(tmpfilename):
919                         resume_len = os.path.getsize(tmpfilename)
920                 else:
921                         resume_len = 0
922
923                 open_mode = 'wb'
924                 if resume_len != 0:
925                         if self.params.get('continuedl', False):
926                                 self.report_resuming_byte(resume_len)
927                                 request.add_header('Range','bytes=%d-' % resume_len)
928                                 open_mode = 'ab'
929                         else:
930                                 resume_len = 0
931
932                 count = 0
933                 retries = self.params.get('retries', 0)
934                 while count <= retries:
935                         # Establish connection
936                         try:
937                                 if count == 0 and 'urlhandle' in info_dict:
938                                         data = info_dict['urlhandle']
939                                 data = urllib2.urlopen(request)
940                                 break
941                         except (urllib2.HTTPError, ), err:
942                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
943                                         # Unexpected HTTP error
944                                         raise
945                                 elif err.code == 416:
946                                         # Unable to resume (requested range not satisfiable)
947                                         try:
948                                                 # Open the connection again without the range header
949                                                 data = urllib2.urlopen(basic_request)
950                                                 content_length = data.info()['Content-Length']
951                                         except (urllib2.HTTPError, ), err:
952                                                 if err.code < 500 or err.code >= 600:
953                                                         raise
954                                         else:
955                                                 # Examine the reported length
956                                                 if (content_length is not None and
957                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
958                                                         # The file had already been fully downloaded.
959                                                         # Explanation to the above condition: in issue #175 it was revealed that
960                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
961                                                         # changing the file size slightly and causing problems for some users. So
962                                                         # I decided to implement a suggested change and consider the file
963                                                         # completely downloaded if the file size differs less than 100 bytes from
964                                                         # the one in the hard drive.
965                                                         self.report_file_already_downloaded(filename)
966                                                         self.try_rename(tmpfilename, filename)
967                                                         return True
968                                                 else:
969                                                         # The length does not match, we start the download over
970                                                         self.report_unable_to_resume()
971                                                         open_mode = 'wb'
972                                                         break
973                         # Retry
974                         count += 1
975                         if count <= retries:
976                                 self.report_retry(count, retries)
977
978                 if count > retries:
979                         self.trouble(u'ERROR: giving up after %s retries' % retries)
980                         return False
981
982                 data_len = data.info().get('Content-length', None)
983                 if data_len is not None:
984                         data_len = long(data_len) + resume_len
985                 data_len_str = self.format_bytes(data_len)
986                 byte_counter = 0 + resume_len
987                 block_size = 1024
988                 start = time.time()
989                 while True:
990                         # Download and write
991                         before = time.time()
992                         data_block = data.read(block_size)
993                         after = time.time()
994                         if len(data_block) == 0:
995                                 break
996                         byte_counter += len(data_block)
997
998                         # Open file just in time
999                         if stream is None:
1000                                 try:
1001                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1002                                         assert stream is not None
1003                                         filename = self.undo_temp_name(tmpfilename)
1004                                         self.report_destination(filename)
1005                                 except (OSError, IOError), err:
1006                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1007                                         return False
1008                         try:
1009                                 stream.write(data_block)
1010                         except (IOError, OSError), err:
1011                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1012                                 return False
1013                         block_size = self.best_block_size(after - before, len(data_block))
1014
1015                         # Progress message
1016                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1017                         if data_len is None:
1018                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1019                         else:
1020                                 percent_str = self.calc_percent(byte_counter, data_len)
1021                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1022                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1023
1024                         # Apply rate limit
1025                         self.slow_down(start, byte_counter - resume_len)
1026
1027                 if stream is None:
1028                         self.trouble(u'\nERROR: Did not get any data blocks')
1029                         return False
1030                 stream.close()
1031                 self.report_finish()
1032                 if data_len is not None and byte_counter != data_len:
1033                         raise ContentTooShortError(byte_counter, long(data_len))
1034                 self.try_rename(tmpfilename, filename)
1035
1036                 # Update file modification time
1037                 if self.params.get('updatetime', True):
1038                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1039
1040                 return True
1041
1042
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces, for each video
	it refers to, a dictionary of extracted data (real video URL, title
	and so on) which is then handed to the FileDownloader for further
	processing (possibly downloading the video, among other outcomes).
	The dictionaries must include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to
	allow youtube-dl to serve as the backend for a video search
	function, such as the one in youtube2mp3; they are only used when
	their respective forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors should redefine _real_initialize() and
	_real_extract() and define a _VALID_URL regexp. Probably, they
	should also be added to the list of extractors.
	"""

	# Flipped to True by initialize() after the first _real_initialize().
	_ready = False
	# The FileDownloader this extractor reports to (may stay None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc.) on first use."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for url."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor should report to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1111
1112
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches watch pages, youtu.be short links and v/embed/e URLs while
	# rejecting playlist/artist pages; group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the site to English/US so later scraping stays predictable.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc for stored credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same ordering, but ranking free (WebM) formats above non-free ones
	# of comparable quality.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> container extension; unlisted formats default to 'flv'
	# (see _print_formats).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# itag -> dimensions shown by _print_formats; presumably
	# 'HEIGHTxWIDTH' (e.g. '22' is the 1280x720 format) — TODO confirm.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1151
1152         def report_lang(self):
1153                 """Report attempt to set language."""
1154                 self._downloader.to_screen(u'[youtube] Setting language')
1155
1156         def report_login(self):
1157                 """Report attempt to log in."""
1158                 self._downloader.to_screen(u'[youtube] Logging in')
1159
1160         def report_age_confirmation(self):
1161                 """Report attempt to confirm age."""
1162                 self._downloader.to_screen(u'[youtube] Confirming age')
1163
1164         def report_video_webpage_download(self, video_id):
1165                 """Report attempt to download video webpage."""
1166                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1167
1168         def report_video_info_webpage_download(self, video_id):
1169                 """Report attempt to download video info webpage."""
1170                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1171
1172         def report_information_extraction(self, video_id):
1173                 """Report attempt to extract video information."""
1174                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1175
1176         def report_unavailable_format(self, video_id, format):
1177                 """Report extracted video URL."""
1178                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1179
1180         def report_rtmp_download(self):
1181                 """Indicate the download will use the RTMP protocol."""
1182                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1183
1184         def _print_formats(self, formats):
1185                 print 'Available formats:'
1186                 for x in formats:
1187                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1188
	def _real_initialize(self):
		"""Set the interface language, then optionally log in and
		confirm age.

		Credentials come from the downloader's username/password params
		or from the 'youtube' .netrc entry. Failures are reported as
		warnings (error for age confirmation) and abort initialization.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1257
	def _real_extract(self, url):
		"""Extract metadata for a single YouTube watch URL and hand each
		selected format to the downloader via process_info().

		Reports failures through self._downloader.trouble() and returns
		early; nothing is returned on success either.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# NOTE(review): group(2) is presumably the bare video id per
		# _VALID_URL (declared earlier in the class, outside this view).
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		# gl/hl force the US/English page; has_verified is presumably meant
		# to skip the age-verification interstitial — not verifiable here.
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL; it appears JSON-escaped in the
		# page, so the backslash escapes are stripped with re.sub below.
		# A missing player URL is not fatal (player_url stays None).
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' parameter variants of the
		# get_video_info endpoint and stop at the first response that
		# contains a 'token'.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			# Every variant failed; surface YouTube's own reason if it gave one.
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image (a missing thumbnail is only a warning, not fatal)
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scrape the human-readable date from the watch page
		# and normalise it to YYYYMMDD; stays u'NA' when absent or unparseable.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): the bare except is meant to skip formats
					# that don't match, but it also swallows KeyboardInterrupt
					# and SystemExit; 'except ValueError' would be safer.
					pass

		# description: prefer lxml when the top-of-file import succeeded;
		# referencing the bare name raises NameError when it did not.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			# Only bother scraping the <meta> description when the caller
			# actually asked for a description.
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token (extracted but unused below; kept for parity with video_info)
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: a single connection URL, no format choice.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# The stream map is a comma-separated list of URL-encoded dicts;
			# build an itag -> url map, keeping only complete entries.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				# Cap quality: only formats at or below the limit are eligible.
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1431
1432
1433 class MetacafeIE(InfoExtractor):
1434         """Information Extractor for metacafe.com."""
1435
1436         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1437         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1438         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1439         _youtube_ie = None
1440         IE_NAME = u'metacafe'
1441
1442         def __init__(self, youtube_ie, downloader=None):
1443                 InfoExtractor.__init__(self, downloader)
1444                 self._youtube_ie = youtube_ie
1445
1446         def report_disclaimer(self):
1447                 """Report disclaimer retrieval."""
1448                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1449
1450         def report_age_confirmation(self):
1451                 """Report attempt to confirm age."""
1452                 self._downloader.to_screen(u'[metacafe] Confirming age')
1453
1454         def report_download_webpage(self, video_id):
1455                 """Report webpage download."""
1456                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1457
1458         def report_extraction(self, video_id):
1459                 """Report information extraction."""
1460                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1461
1462         def _real_initialize(self):
1463                 # Retrieve disclaimer
1464                 request = urllib2.Request(self._DISCLAIMER)
1465                 try:
1466                         self.report_disclaimer()
1467                         disclaimer = urllib2.urlopen(request).read()
1468                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1469                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1470                         return
1471
1472                 # Confirm age
1473                 disclaimer_form = {
1474                         'filters': '0',
1475                         'submit': "Continue - I'm over 18",
1476                         }
1477                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1478                 try:
1479                         self.report_age_confirmation()
1480                         disclaimer = urllib2.urlopen(request).read()
1481                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1482                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1483                         return
1484
1485         def _real_extract(self, url):
1486                 # Extract id and simplified title from URL
1487                 mobj = re.match(self._VALID_URL, url)
1488                 if mobj is None:
1489                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1490                         return
1491
1492                 video_id = mobj.group(1)
1493
1494                 # Check if video comes from YouTube
1495                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1496                 if mobj2 is not None:
1497                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1498                         return
1499
1500                 # At this point we have a new video
1501                 self._downloader.increment_downloads()
1502
1503                 simple_title = mobj.group(2).decode('utf-8')
1504
1505                 # Retrieve video webpage to extract further information
1506                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1507                 try:
1508                         self.report_download_webpage(video_id)
1509                         webpage = urllib2.urlopen(request).read()
1510                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1511                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1512                         return
1513
1514                 # Extract URL, uploader and title from webpage
1515                 self.report_extraction(video_id)
1516                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1517                 if mobj is not None:
1518                         mediaURL = urllib.unquote(mobj.group(1))
1519                         video_extension = mediaURL[-3:]
1520
1521                         # Extract gdaKey if available
1522                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1523                         if mobj is None:
1524                                 video_url = mediaURL
1525                         else:
1526                                 gdaKey = mobj.group(1)
1527                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1528                 else:
1529                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1530                         if mobj is None:
1531                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1532                                 return
1533                         vardict = parse_qs(mobj.group(1))
1534                         if 'mediaData' not in vardict:
1535                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1536                                 return
1537                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1538                         if mobj is None:
1539                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1540                                 return
1541                         mediaURL = mobj.group(1).replace('\\/', '/')
1542                         video_extension = mediaURL[-3:]
1543                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1544
1545                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1546                 if mobj is None:
1547                         self._downloader.trouble(u'ERROR: unable to extract title')
1548                         return
1549                 video_title = mobj.group(1).decode('utf-8')
1550                 video_title = sanitize_title(video_title)
1551
1552                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1553                 if mobj is None:
1554                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1555                         return
1556                 video_uploader = mobj.group(1)
1557
1558                 try:
1559                         # Process video information
1560                         self._downloader.process_info({
1561                                 'id':           video_id.decode('utf-8'),
1562                                 'url':          video_url.decode('utf-8'),
1563                                 'uploader':     video_uploader.decode('utf-8'),
1564                                 'upload_date':  u'NA',
1565                                 'title':        video_title,
1566                                 'stitle':       simple_title,
1567                                 'ext':          video_extension.decode('utf-8'),
1568                                 'format':       u'NA',
1569                                 'player_url':   None,
1570                         })
1571                 except UnavailableVideoError:
1572                         self._downloader.trouble(u'\nERROR: unable to download video')
1573
1574
1575 class DailymotionIE(InfoExtractor):
1576         """Information Extractor for Dailymotion"""
1577
1578         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1579         IE_NAME = u'dailymotion'
1580
1581         def __init__(self, downloader=None):
1582                 InfoExtractor.__init__(self, downloader)
1583
1584         def report_download_webpage(self, video_id):
1585                 """Report webpage download."""
1586                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1587
1588         def report_extraction(self, video_id):
1589                 """Report information extraction."""
1590                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1591
1592         def _real_extract(self, url):
1593                 htmlParser = HTMLParser.HTMLParser()
1594                 
1595                 # Extract id and simplified title from URL
1596                 mobj = re.match(self._VALID_URL, url)
1597                 if mobj is None:
1598                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1599                         return
1600
1601                 # At this point we have a new video
1602                 self._downloader.increment_downloads()
1603                 video_id = mobj.group(1)
1604
1605                 video_extension = 'flv'
1606
1607                 # Retrieve video webpage to extract further information
1608                 request = urllib2.Request(url)
1609                 request.add_header('Cookie', 'family_filter=off')
1610                 try:
1611                         self.report_download_webpage(video_id)
1612                         webpage = urllib2.urlopen(request).read()
1613                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1614                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1615                         return
1616
1617                 # Extract URL, uploader and title from webpage
1618                 self.report_extraction(video_id)
1619                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1620                 if mobj is None:
1621                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1622                         return
1623                 sequence = urllib.unquote(mobj.group(1))
1624                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1625                 if mobj is None:
1626                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1627                         return
1628                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1629
1630                 # if needed add http://www.dailymotion.com/ if relative URL
1631
1632                 video_url = mediaURL
1633
1634                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1635                 if mobj is None:
1636                         self._downloader.trouble(u'ERROR: unable to extract title')
1637                         return
1638                 video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8')
1639                 video_title = sanitize_title(video_title)
1640                 simple_title = _simplify_title(video_title)
1641
1642                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1643                 if mobj is None:
1644                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1645                         return
1646                 video_uploader = mobj.group(1)
1647
1648                 try:
1649                         # Process video information
1650                         self._downloader.process_info({
1651                                 'id':           video_id.decode('utf-8'),
1652                                 'url':          video_url.decode('utf-8'),
1653                                 'uploader':     video_uploader.decode('utf-8'),
1654                                 'upload_date':  u'NA',
1655                                 'title':        video_title,
1656                                 'stitle':       simple_title,
1657                                 'ext':          video_extension.decode('utf-8'),
1658                                 'format':       u'NA',
1659                                 'player_url':   None,
1660                         })
1661                 except UnavailableVideoError:
1662                         self._downloader.trouble(u'\nERROR: unable to download video')
1663
1664
1665 class GoogleIE(InfoExtractor):
1666         """Information extractor for video.google.com."""
1667
1668         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1669         IE_NAME = u'video.google'
1670
1671         def __init__(self, downloader=None):
1672                 InfoExtractor.__init__(self, downloader)
1673
1674         def report_download_webpage(self, video_id):
1675                 """Report webpage download."""
1676                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1677
1678         def report_extraction(self, video_id):
1679                 """Report information extraction."""
1680                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1681
1682         def _real_extract(self, url):
1683                 # Extract id from URL
1684                 mobj = re.match(self._VALID_URL, url)
1685                 if mobj is None:
1686                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1687                         return
1688
1689                 # At this point we have a new video
1690                 self._downloader.increment_downloads()
1691                 video_id = mobj.group(1)
1692
1693                 video_extension = 'mp4'
1694
1695                 # Retrieve video webpage to extract further information
1696                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1697                 try:
1698                         self.report_download_webpage(video_id)
1699                         webpage = urllib2.urlopen(request).read()
1700                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1701                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1702                         return
1703
1704                 # Extract URL, uploader, and title from webpage
1705                 self.report_extraction(video_id)
1706                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1707                 if mobj is None:
1708                         video_extension = 'flv'
1709                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1710                 if mobj is None:
1711                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1712                         return
1713                 mediaURL = urllib.unquote(mobj.group(1))
1714                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1715                 mediaURL = mediaURL.replace('\\x26', '\x26')
1716
1717                 video_url = mediaURL
1718
1719                 mobj = re.search(r'<title>(.*)</title>', webpage)
1720                 if mobj is None:
1721                         self._downloader.trouble(u'ERROR: unable to extract title')
1722                         return
1723                 video_title = mobj.group(1).decode('utf-8')
1724                 video_title = sanitize_title(video_title)
1725                 simple_title = _simplify_title(video_title)
1726
1727                 # Extract video description
1728                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1729                 if mobj is None:
1730                         self._downloader.trouble(u'ERROR: unable to extract video description')
1731                         return
1732                 video_description = mobj.group(1).decode('utf-8')
1733                 if not video_description:
1734                         video_description = 'No description available.'
1735
1736                 # Extract video thumbnail
1737                 if self._downloader.params.get('forcethumbnail', False):
1738                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1739                         try:
1740                                 webpage = urllib2.urlopen(request).read()
1741                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1742                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1743                                 return
1744                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1745                         if mobj is None:
1746                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1747                                 return
1748                         video_thumbnail = mobj.group(1)
1749                 else:   # we need something to pass to process_info
1750                         video_thumbnail = ''
1751
1752                 try:
1753                         # Process video information
1754                         self._downloader.process_info({
1755                                 'id':           video_id.decode('utf-8'),
1756                                 'url':          video_url.decode('utf-8'),
1757                                 'uploader':     u'NA',
1758                                 'upload_date':  u'NA',
1759                                 'title':        video_title,
1760                                 'stitle':       simple_title,
1761                                 'ext':          video_extension.decode('utf-8'),
1762                                 'format':       u'NA',
1763                                 'player_url':   None,
1764                         })
1765                 except UnavailableVideoError:
1766                         self._downloader.trouble(u'\nERROR: unable to download video')
1767
1768
1769 class PhotobucketIE(InfoExtractor):
1770         """Information extractor for photobucket.com."""
1771
1772         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1773         IE_NAME = u'photobucket'
1774
1775         def __init__(self, downloader=None):
1776                 InfoExtractor.__init__(self, downloader)
1777
1778         def report_download_webpage(self, video_id):
1779                 """Report webpage download."""
1780                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1781
1782         def report_extraction(self, video_id):
1783                 """Report information extraction."""
1784                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1785
1786         def _real_extract(self, url):
1787                 # Extract id from URL
1788                 mobj = re.match(self._VALID_URL, url)
1789                 if mobj is None:
1790                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1791                         return
1792
1793                 # At this point we have a new video
1794                 self._downloader.increment_downloads()
1795                 video_id = mobj.group(1)
1796
1797                 video_extension = 'flv'
1798
1799                 # Retrieve video webpage to extract further information
1800                 request = urllib2.Request(url)
1801                 try:
1802                         self.report_download_webpage(video_id)
1803                         webpage = urllib2.urlopen(request).read()
1804                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1806                         return
1807
1808                 # Extract URL, uploader, and title from webpage
1809                 self.report_extraction(video_id)
1810                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1813                         return
1814                 mediaURL = urllib.unquote(mobj.group(1))
1815
1816                 video_url = mediaURL
1817
1818                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1819                 if mobj is None:
1820                         self._downloader.trouble(u'ERROR: unable to extract title')
1821                         return
1822                 video_title = mobj.group(1).decode('utf-8')
1823                 video_title = sanitize_title(video_title)
1824                 simple_title = _simplify_title(vide_title)
1825
1826                 video_uploader = mobj.group(2).decode('utf-8')
1827
1828                 try:
1829                         # Process video information
1830                         self._downloader.process_info({
1831                                 'id':           video_id.decode('utf-8'),
1832                                 'url':          video_url.decode('utf-8'),
1833                                 'uploader':     video_uploader,
1834                                 'upload_date':  u'NA',
1835                                 'title':        video_title,
1836                                 'stitle':       simple_title,
1837                                 'ext':          video_extension.decode('utf-8'),
1838                                 'format':       u'NA',
1839                                 'player_url':   None,
1840                         })
1841                 except UnavailableVideoError:
1842                         self._downloader.trouble(u'\nERROR: unable to download video')
1843
1844
1845 class YahooIE(InfoExtractor):
1846         """Information extractor for video.yahoo.com."""
1847
1848         # _VALID_URL matches all Yahoo! Video URLs
1849         # _VPAGE_URL matches only the extractable '/watch/' URLs
1850         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1851         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1852         IE_NAME = u'video.yahoo'
1853
1854         def __init__(self, downloader=None):
1855                 InfoExtractor.__init__(self, downloader)
1856
1857         def report_download_webpage(self, video_id):
1858                 """Report webpage download."""
1859                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1860
1861         def report_extraction(self, video_id):
1862                 """Report information extraction."""
1863                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1864
1865         def _real_extract(self, url, new_video=True):
1866                 # Extract ID from URL
1867                 mobj = re.match(self._VALID_URL, url)
1868                 if mobj is None:
1869                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1870                         return
1871
1872                 # At this point we have a new video
1873                 self._downloader.increment_downloads()
1874                 video_id = mobj.group(2)
1875                 video_extension = 'flv'
1876
1877                 # Rewrite valid but non-extractable URLs as
1878                 # extractable English language /watch/ URLs
1879                 if re.match(self._VPAGE_URL, url) is None:
1880                         request = urllib2.Request(url)
1881                         try:
1882                                 webpage = urllib2.urlopen(request).read()
1883                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1885                                 return
1886
1887                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1888                         if mobj is None:
1889                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1890                                 return
1891                         yahoo_id = mobj.group(1)
1892
1893                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1894                         if mobj is None:
1895                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1896                                 return
1897                         yahoo_vid = mobj.group(1)
1898
1899                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1900                         return self._real_extract(url, new_video=False)
1901
1902                 # Retrieve video webpage to extract further information
1903                 request = urllib2.Request(url)
1904                 try:
1905                         self.report_download_webpage(video_id)
1906                         webpage = urllib2.urlopen(request).read()
1907                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1909                         return
1910
1911                 # Extract uploader and title from webpage
1912                 self.report_extraction(video_id)
1913                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1914                 if mobj is None:
1915                         self._downloader.trouble(u'ERROR: unable to extract video title')
1916                         return
1917                 video_title = mobj.group(1).decode('utf-8')
1918                 simple_title = _simplify_title(video_title)
1919
1920                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1923                         return
1924                 video_uploader = mobj.group(1).decode('utf-8')
1925
1926                 # Extract video thumbnail
1927                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1928                 if mobj is None:
1929                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1930                         return
1931                 video_thumbnail = mobj.group(1).decode('utf-8')
1932
1933                 # Extract video description
1934                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1935                 if mobj is None:
1936                         self._downloader.trouble(u'ERROR: unable to extract video description')
1937                         return
1938                 video_description = mobj.group(1).decode('utf-8')
1939                 if not video_description:
1940                         video_description = 'No description available.'
1941
1942                 # Extract video height and width
1943                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1944                 if mobj is None:
1945                         self._downloader.trouble(u'ERROR: unable to extract video height')
1946                         return
1947                 yv_video_height = mobj.group(1)
1948
1949                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1950                 if mobj is None:
1951                         self._downloader.trouble(u'ERROR: unable to extract video width')
1952                         return
1953                 yv_video_width = mobj.group(1)
1954
1955                 # Retrieve video playlist to extract media URL
1956                 # I'm not completely sure what all these options are, but we
1957                 # seem to need most of them, otherwise the server sends a 401.
1958                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1959                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1960                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1961                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1962                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1963                 try:
1964                         self.report_download_webpage(video_id)
1965                         webpage = urllib2.urlopen(request).read()
1966                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1967                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1968                         return
1969
1970                 # Extract media URL from playlist XML
1971                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1972                 if mobj is None:
1973                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1974                         return
1975                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1976                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1977
1978                 try:
1979                         # Process video information
1980                         self._downloader.process_info({
1981                                 'id':           video_id.decode('utf-8'),
1982                                 'url':          video_url,
1983                                 'uploader':     video_uploader,
1984                                 'upload_date':  u'NA',
1985                                 'title':        video_title,
1986                                 'stitle':       simple_title,
1987                                 'ext':          video_extension.decode('utf-8'),
1988                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1989                                 'description':  video_description,
1990                                 'thumbnail':    video_thumbnail,
1991                                 'player_url':   None,
1992                         })
1993                 except UnavailableVideoError:
1994                         self._downloader.trouble(u'\nERROR: unable to download video')
1995
1996
1997 class VimeoIE(InfoExtractor):
1998         """Information extractor for vimeo.com."""
1999
2000         # _VALID_URL matches Vimeo URLs
2001         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2002         IE_NAME = u'vimeo'
2003
2004         def __init__(self, downloader=None):
2005                 InfoExtractor.__init__(self, downloader)
2006
2007         def report_download_webpage(self, video_id):
2008                 """Report webpage download."""
2009                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2010
2011         def report_extraction(self, video_id):
2012                 """Report information extraction."""
2013                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2014
2015         def _real_extract(self, url, new_video=True):
2016                 # Extract ID from URL
2017                 mobj = re.match(self._VALID_URL, url)
2018                 if mobj is None:
2019                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2020                         return
2021
2022                 # At this point we have a new video
2023                 self._downloader.increment_downloads()
2024                 video_id = mobj.group(1)
2025
2026                 # Retrieve video webpage to extract further information
2027                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2028                 try:
2029                         self.report_download_webpage(video_id)
2030                         webpage = urllib2.urlopen(request).read()
2031                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2032                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2033                         return
2034
2035                 # Now we begin extracting as much information as we can from what we
2036                 # retrieved. First we extract the information common to all extractors,
2037                 # and latter we extract those that are Vimeo specific.
2038                 self.report_extraction(video_id)
2039
2040                 # Extract title
2041                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2042                 if mobj is None:
2043                         self._downloader.trouble(u'ERROR: unable to extract video title')
2044                         return
2045                 video_title = mobj.group(1).decode('utf-8')
2046                 simple_title = _simplify_title(video_title)
2047
2048                 # Extract uploader
2049                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2050                 if mobj is None:
2051                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2052                         return
2053                 video_uploader = mobj.group(1).decode('utf-8')
2054
2055                 # Extract video thumbnail
2056                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2057                 if mobj is None:
2058                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2059                         return
2060                 video_thumbnail = mobj.group(1).decode('utf-8')
2061
2062                 # # Extract video description
2063                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2064                 # if mobj is None:
2065                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2066                 #       return
2067                 # video_description = mobj.group(1).decode('utf-8')
2068                 # if not video_description: video_description = 'No description available.'
2069                 video_description = 'Foo.'
2070
2071                 # Vimeo specific: extract request signature
2072                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2073                 if mobj is None:
2074                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2075                         return
2076                 sig = mobj.group(1).decode('utf-8')
2077
2078                 # Vimeo specific: extract video quality information
2079                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2080                 if mobj is None:
2081                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2082                         return
2083                 quality = mobj.group(1).decode('utf-8')
2084
2085                 if int(quality) == 1:
2086                         quality = 'hd'
2087                 else:
2088                         quality = 'sd'
2089
2090                 # Vimeo specific: Extract request signature expiration
2091                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2092                 if mobj is None:
2093                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2094                         return
2095                 sig_exp = mobj.group(1).decode('utf-8')
2096
2097                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2098
2099                 try:
2100                         # Process video information
2101                         self._downloader.process_info({
2102                                 'id':           video_id.decode('utf-8'),
2103                                 'url':          video_url,
2104                                 'uploader':     video_uploader,
2105                                 'upload_date':  u'NA',
2106                                 'title':        video_title,
2107                                 'stitle':       simple_title,
2108                                 'ext':          u'mp4',
2109                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2110                                 'description':  video_description,
2111                                 'thumbnail':    video_thumbnail,
2112                                 'description':  video_description,
2113                                 'player_url':   None,
2114                         })
2115                 except UnavailableVideoError:
2116                         self._downloader.trouble(u'ERROR: unable to download video')
2117
2118
2119 class GenericIE(InfoExtractor):
2120         """Generic last-resort information extractor."""
2121
2122         _VALID_URL = r'.*'
2123         IE_NAME = u'generic'
2124
2125         def __init__(self, downloader=None):
2126                 InfoExtractor.__init__(self, downloader)
2127
2128         def report_download_webpage(self, video_id):
2129                 """Report webpage download."""
2130                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2131                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2132
2133         def report_extraction(self, video_id):
2134                 """Report information extraction."""
2135                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2136
2137         def _real_extract(self, url):
2138                 # At this point we have a new video
2139                 self._downloader.increment_downloads()
2140
2141                 video_id = url.split('/')[-1]
2142                 request = urllib2.Request(url)
2143                 try:
2144                         self.report_download_webpage(video_id)
2145                         webpage = urllib2.urlopen(request).read()
2146                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2148                         return
2149                 except ValueError, err:
2150                         # since this is the last-resort InfoExtractor, if
2151                         # this error is thrown, it'll be thrown here
2152                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2153                         return
2154
2155                 self.report_extraction(video_id)
2156                 # Start with something easy: JW Player in SWFObject
2157                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2158                 if mobj is None:
2159                         # Broaden the search a little bit
2160                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2161                 if mobj is None:
2162                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2163                         return
2164
2165                 # It's possible that one of the regexes
2166                 # matched, but returned an empty group:
2167                 if mobj.group(1) is None:
2168                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2169                         return
2170
2171                 video_url = urllib.unquote(mobj.group(1))
2172                 video_id = os.path.basename(video_url)
2173
2174                 # here's a fun little line of code for you:
2175                 video_extension = os.path.splitext(video_id)[1][1:]
2176                 video_id = os.path.splitext(video_id)[0]
2177
2178                 # it's tempting to parse this further, but you would
2179                 # have to take into account all the variations like
2180                 #   Video Title - Site Name
2181                 #   Site Name | Video Title
2182                 #   Video Title - Tagline | Site Name
2183                 # and so on and so forth; it's just not practical
2184                 mobj = re.search(r'<title>(.*)</title>', webpage)
2185                 if mobj is None:
2186                         self._downloader.trouble(u'ERROR: unable to extract title')
2187                         return
2188                 video_title = mobj.group(1).decode('utf-8')
2189                 video_title = sanitize_title(video_title)
2190                 simple_title = _simplify_title(video_title)
2191
2192                 # video uploader is domain name
2193                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2194                 if mobj is None:
2195                         self._downloader.trouble(u'ERROR: unable to extract title')
2196                         return
2197                 video_uploader = mobj.group(1).decode('utf-8')
2198
2199                 try:
2200                         # Process video information
2201                         self._downloader.process_info({
2202                                 'id':           video_id.decode('utf-8'),
2203                                 'url':          video_url.decode('utf-8'),
2204                                 'uploader':     video_uploader,
2205                                 'upload_date':  u'NA',
2206                                 'title':        video_title,
2207                                 'stitle':       simple_title,
2208                                 'ext':          video_extension.decode('utf-8'),
2209                                 'format':       u'NA',
2210                                 'player_url':   None,
2211                         })
2212                 except UnavailableVideoError, err:
2213                         self._downloader.trouble(u'\nERROR: unable to download video')
2214
2215
2216 class YoutubeSearchIE(InfoExtractor):
2217         """Information Extractor for YouTube search queries."""
2218         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2219         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2220         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2221         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2222         _youtube_ie = None
2223         _max_youtube_results = 1000
2224         IE_NAME = u'youtube:search'
2225
2226         def __init__(self, youtube_ie, downloader=None):
2227                 InfoExtractor.__init__(self, downloader)
2228                 self._youtube_ie = youtube_ie
2229
2230         def report_download_page(self, query, pagenum):
2231                 """Report attempt to download playlist page with given number."""
2232                 query = query.decode(preferredencoding())
2233                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2234
2235         def _real_initialize(self):
2236                 self._youtube_ie.initialize()
2237
2238         def _real_extract(self, query):
2239                 mobj = re.match(self._VALID_URL, query)
2240                 if mobj is None:
2241                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2242                         return
2243
2244                 prefix, query = query.split(':')
2245                 prefix = prefix[8:]
2246                 query = query.encode('utf-8')
2247                 if prefix == '':
2248                         self._download_n_results(query, 1)
2249                         return
2250                 elif prefix == 'all':
2251                         self._download_n_results(query, self._max_youtube_results)
2252                         return
2253                 else:
2254                         try:
2255                                 n = long(prefix)
2256                                 if n <= 0:
2257                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2258                                         return
2259                                 elif n > self._max_youtube_results:
2260                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2261                                         n = self._max_youtube_results
2262                                 self._download_n_results(query, n)
2263                                 return
2264                         except ValueError: # parsing prefix as integer fails
2265                                 self._download_n_results(query, 1)
2266                                 return
2267
2268         def _download_n_results(self, query, n):
2269                 """Downloads a specified number of results for a query"""
2270
2271                 video_ids = []
2272                 already_seen = set()
2273                 pagenum = 1
2274
2275                 while True:
2276                         self.report_download_page(query, pagenum)
2277                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2278                         request = urllib2.Request(result_url)
2279                         try:
2280                                 page = urllib2.urlopen(request).read()
2281                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2282                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2283                                 return
2284
2285                         # Extract video identifiers
2286                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2287                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2288                                 if video_id not in already_seen:
2289                                         video_ids.append(video_id)
2290                                         already_seen.add(video_id)
2291                                         if len(video_ids) == n:
2292                                                 # Specified n videos reached
2293                                                 for id in video_ids:
2294                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2295                                                 return
2296
2297                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2298                                 for id in video_ids:
2299                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2300                                 return
2301
2302                         pagenum = pagenum + 1
2303
2304
2305 class GoogleSearchIE(InfoExtractor):
2306         """Information Extractor for Google Video search queries."""
2307         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2308         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2309         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2310         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2311         _google_ie = None
2312         _max_google_results = 1000
2313         IE_NAME = u'video.google:search'
2314
2315         def __init__(self, google_ie, downloader=None):
2316                 InfoExtractor.__init__(self, downloader)
2317                 self._google_ie = google_ie
2318
2319         def report_download_page(self, query, pagenum):
2320                 """Report attempt to download playlist page with given number."""
2321                 query = query.decode(preferredencoding())
2322                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2323
2324         def _real_initialize(self):
2325                 self._google_ie.initialize()
2326
2327         def _real_extract(self, query):
2328                 mobj = re.match(self._VALID_URL, query)
2329                 if mobj is None:
2330                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2331                         return
2332
2333                 prefix, query = query.split(':')
2334                 prefix = prefix[8:]
2335                 query = query.encode('utf-8')
2336                 if prefix == '':
2337                         self._download_n_results(query, 1)
2338                         return
2339                 elif prefix == 'all':
2340                         self._download_n_results(query, self._max_google_results)
2341                         return
2342                 else:
2343                         try:
2344                                 n = long(prefix)
2345                                 if n <= 0:
2346                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2347                                         return
2348                                 elif n > self._max_google_results:
2349                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2350                                         n = self._max_google_results
2351                                 self._download_n_results(query, n)
2352                                 return
2353                         except ValueError: # parsing prefix as integer fails
2354                                 self._download_n_results(query, 1)
2355                                 return
2356
2357         def _download_n_results(self, query, n):
2358                 """Downloads a specified number of results for a query"""
2359
2360                 video_ids = []
2361                 already_seen = set()
2362                 pagenum = 1
2363
2364                 while True:
2365                         self.report_download_page(query, pagenum)
2366                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2367                         request = urllib2.Request(result_url)
2368                         try:
2369                                 page = urllib2.urlopen(request).read()
2370                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2372                                 return
2373
2374                         # Extract video identifiers
2375                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2376                                 video_id = mobj.group(1)
2377                                 if video_id not in already_seen:
2378                                         video_ids.append(video_id)
2379                                         already_seen.add(video_id)
2380                                         if len(video_ids) == n:
2381                                                 # Specified n videos reached
2382                                                 for id in video_ids:
2383                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2384                                                 return
2385
2386                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2387                                 for id in video_ids:
2388                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2389                                 return
2390
2391                         pagenum = pagenum + 1
2392
2393
2394 class YahooSearchIE(InfoExtractor):
2395         """Information Extractor for Yahoo! Video search queries."""
2396         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2397         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2398         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2399         _MORE_PAGES_INDICATOR = r'\s*Next'
2400         _yahoo_ie = None
2401         _max_yahoo_results = 1000
2402         IE_NAME = u'video.yahoo:search'
2403
2404         def __init__(self, yahoo_ie, downloader=None):
2405                 InfoExtractor.__init__(self, downloader)
2406                 self._yahoo_ie = yahoo_ie
2407
2408         def report_download_page(self, query, pagenum):
2409                 """Report attempt to download playlist page with given number."""
2410                 query = query.decode(preferredencoding())
2411                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2412
2413         def _real_initialize(self):
2414                 self._yahoo_ie.initialize()
2415
2416         def _real_extract(self, query):
2417                 mobj = re.match(self._VALID_URL, query)
2418                 if mobj is None:
2419                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2420                         return
2421
2422                 prefix, query = query.split(':')
2423                 prefix = prefix[8:]
2424                 query = query.encode('utf-8')
2425                 if prefix == '':
2426                         self._download_n_results(query, 1)
2427                         return
2428                 elif prefix == 'all':
2429                         self._download_n_results(query, self._max_yahoo_results)
2430                         return
2431                 else:
2432                         try:
2433                                 n = long(prefix)
2434                                 if n <= 0:
2435                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2436                                         return
2437                                 elif n > self._max_yahoo_results:
2438                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2439                                         n = self._max_yahoo_results
2440                                 self._download_n_results(query, n)
2441                                 return
2442                         except ValueError: # parsing prefix as integer fails
2443                                 self._download_n_results(query, 1)
2444                                 return
2445
2446         def _download_n_results(self, query, n):
2447                 """Downloads a specified number of results for a query"""
2448
2449                 video_ids = []
2450                 already_seen = set()
2451                 pagenum = 1
2452
2453                 while True:
2454                         self.report_download_page(query, pagenum)
2455                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2456                         request = urllib2.Request(result_url)
2457                         try:
2458                                 page = urllib2.urlopen(request).read()
2459                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2461                                 return
2462
2463                         # Extract video identifiers
2464                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465                                 video_id = mobj.group(1)
2466                                 if video_id not in already_seen:
2467                                         video_ids.append(video_id)
2468                                         already_seen.add(video_id)
2469                                         if len(video_ids) == n:
2470                                                 # Specified n videos reached
2471                                                 for id in video_ids:
2472                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2473                                                 return
2474
2475                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2476                                 for id in video_ids:
2477                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2478                                 return
2479
2480                         pagenum = pagenum + 1
2481
2482
2483 class YoutubePlaylistIE(InfoExtractor):
2484         """Information Extractor for YouTube playlists."""
2485
2486         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2487         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2488         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2489         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2490         _youtube_ie = None
2491         IE_NAME = u'youtube:playlist'
2492
2493         def __init__(self, youtube_ie, downloader=None):
2494                 InfoExtractor.__init__(self, downloader)
2495                 self._youtube_ie = youtube_ie
2496
2497         def report_download_page(self, playlist_id, pagenum):
2498                 """Report attempt to download playlist page with given number."""
2499                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2500
2501         def _real_initialize(self):
2502                 self._youtube_ie.initialize()
2503
2504         def _real_extract(self, url):
2505                 # Extract playlist id
2506                 mobj = re.match(self._VALID_URL, url)
2507                 if mobj is None:
2508                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2509                         return
2510
2511                 # Single video case
2512                 if mobj.group(3) is not None:
2513                         self._youtube_ie.extract(mobj.group(3))
2514                         return
2515
2516                 # Download playlist pages
2517                 # prefix is 'p' as default for playlists but there are other types that need extra care
2518                 playlist_prefix = mobj.group(1)
2519                 if playlist_prefix == 'a':
2520                         playlist_access = 'artist'
2521                 else:
2522                         playlist_prefix = 'p'
2523                         playlist_access = 'view_play_list'
2524                 playlist_id = mobj.group(2)
2525                 video_ids = []
2526                 pagenum = 1
2527
2528                 while True:
2529                         self.report_download_page(playlist_id, pagenum)
2530                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2531                         request = urllib2.Request(url)
2532                         try:
2533                                 page = urllib2.urlopen(request).read()
2534                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2536                                 return
2537
2538                         # Extract video identifiers
2539                         ids_in_page = []
2540                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2541                                 if mobj.group(1) not in ids_in_page:
2542                                         ids_in_page.append(mobj.group(1))
2543                         video_ids.extend(ids_in_page)
2544
2545                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2546                                 break
2547                         pagenum = pagenum + 1
2548
2549                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2550                 playlistend = self._downloader.params.get('playlistend', -1)
2551                 video_ids = video_ids[playliststart:playlistend]
2552
2553                 for id in video_ids:
2554                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2555                 return
2556
2557
2558 class YoutubeUserIE(InfoExtractor):
2559         """Information Extractor for YouTube users."""
2560
2561         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2562         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2563         _GDATA_PAGE_SIZE = 50
2564         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2565         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2566         _youtube_ie = None
2567         IE_NAME = u'youtube:user'
2568
2569         def __init__(self, youtube_ie, downloader=None):
2570                 InfoExtractor.__init__(self, downloader)
2571                 self._youtube_ie = youtube_ie
2572
2573         def report_download_page(self, username, start_index):
2574                 """Report attempt to download user page."""
2575                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2576                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2577
2578         def _real_initialize(self):
2579                 self._youtube_ie.initialize()
2580
2581         def _real_extract(self, url):
2582                 # Extract username
2583                 mobj = re.match(self._VALID_URL, url)
2584                 if mobj is None:
2585                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2586                         return
2587
2588                 username = mobj.group(1)
2589
2590                 # Download video ids using YouTube Data API. Result size per
2591                 # query is limited (currently to 50 videos) so we need to query
2592                 # page by page until there are no video ids - it means we got
2593                 # all of them.
2594
2595                 video_ids = []
2596                 pagenum = 0
2597
2598                 while True:
2599                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2600                         self.report_download_page(username, start_index)
2601
2602                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2603
2604                         try:
2605                                 page = urllib2.urlopen(request).read()
2606                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2607                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2608                                 return
2609
2610                         # Extract video identifiers
2611                         ids_in_page = []
2612
2613                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2614                                 if mobj.group(1) not in ids_in_page:
2615                                         ids_in_page.append(mobj.group(1))
2616
2617                         video_ids.extend(ids_in_page)
2618
2619                         # A little optimization - if current page is not
2620                         # "full", ie. does not contain PAGE_SIZE video ids then
2621                         # we can assume that this page is the last one - there
2622                         # are no more ids on further pages - no need to query
2623                         # again.
2624
2625                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2626                                 break
2627
2628                         pagenum += 1
2629
2630                 all_ids_count = len(video_ids)
2631                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2632                 playlistend = self._downloader.params.get('playlistend', -1)
2633
2634                 if playlistend == -1:
2635                         video_ids = video_ids[playliststart:]
2636                 else:
2637                         video_ids = video_ids[playliststart:playlistend]
2638
2639                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2640                                 (username, all_ids_count, len(video_ids)))
2641
2642                 for video_id in video_ids:
2643                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2644
2645
2646 class DepositFilesIE(InfoExtractor):
2647         """Information extractor for depositfiles.com"""
2648
2649         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2650         IE_NAME = u'DepositFiles'
2651
2652         def __init__(self, downloader=None):
2653                 InfoExtractor.__init__(self, downloader)
2654
2655         def report_download_webpage(self, file_id):
2656                 """Report webpage download."""
2657                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2658
2659         def report_extraction(self, file_id):
2660                 """Report information extraction."""
2661                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2662
2663         def _real_extract(self, url):
2664                 # At this point we have a new file
2665                 self._downloader.increment_downloads()
2666
2667                 file_id = url.split('/')[-1]
2668                 # Rebuild url in english locale
2669                 url = 'http://depositfiles.com/en/files/' + file_id
2670
2671                 # Retrieve file webpage with 'Free download' button pressed
2672                 free_download_indication = { 'gateway_result' : '1' }
2673                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2674                 try:
2675                         self.report_download_webpage(file_id)
2676                         webpage = urllib2.urlopen(request).read()
2677                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2678                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2679                         return
2680
2681                 # Search for the real file URL
2682                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2683                 if (mobj is None) or (mobj.group(1) is None):
2684                         # Try to figure out reason of the error.
2685                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2686                         if (mobj is not None) and (mobj.group(1) is not None):
2687                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2688                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2689                         else:
2690                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2691                         return
2692
2693                 file_url = mobj.group(1)
2694                 file_extension = os.path.splitext(file_url)[1][1:]
2695
2696                 # Search for file title
2697                 mobj = re.search(r'<b title="(.*?)">', webpage)
2698                 if mobj is None:
2699                         self._downloader.trouble(u'ERROR: unable to extract title')
2700                         return
2701                 file_title = mobj.group(1).decode('utf-8')
2702
2703                 try:
2704                         # Process file information
2705                         self._downloader.process_info({
2706                                 'id':           file_id.decode('utf-8'),
2707                                 'url':          file_url.decode('utf-8'),
2708                                 'uploader':     u'NA',
2709                                 'upload_date':  u'NA',
2710                                 'title':        file_title,
2711                                 'stitle':       file_title,
2712                                 'ext':          file_extension.decode('utf-8'),
2713                                 'format':       u'NA',
2714                                 'player_url':   None,
2715                         })
2716                 except UnavailableVideoError, err:
2717                         self._downloader.trouble(u'ERROR: unable to download file')
2718
2719
2720 class FacebookIE(InfoExtractor):
2721         """Information Extractor for Facebook"""
2722
2723         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2724         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2725         _NETRC_MACHINE = 'facebook'
2726         _available_formats = ['video', 'highqual', 'lowqual']
2727         _video_extensions = {
2728                 'video': 'mp4',
2729                 'highqual': 'mp4',
2730                 'lowqual': 'mp4',
2731         }
2732         IE_NAME = u'facebook'
2733
2734         def __init__(self, downloader=None):
2735                 InfoExtractor.__init__(self, downloader)
2736
2737         def _reporter(self, message):
2738                 """Add header and report message."""
2739                 self._downloader.to_screen(u'[facebook] %s' % message)
2740
2741         def report_login(self):
2742                 """Report attempt to log in."""
2743                 self._reporter(u'Logging in')
2744
2745         def report_video_webpage_download(self, video_id):
2746                 """Report attempt to download video webpage."""
2747                 self._reporter(u'%s: Downloading video webpage' % video_id)
2748
2749         def report_information_extraction(self, video_id):
2750                 """Report attempt to extract video information."""
2751                 self._reporter(u'%s: Extracting video information' % video_id)
2752
2753         def _parse_page(self, video_webpage):
2754                 """Extract video information from page"""
2755                 # General data
2756                 data = {'title': r'\("video_title", "(.*?)"\)',
2757                         'description': r'<div class="datawrap">(.*?)</div>',
2758                         'owner': r'\("video_owner_name", "(.*?)"\)',
2759                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2760                         }
2761                 video_info = {}
2762                 for piece in data.keys():
2763                         mobj = re.search(data[piece], video_webpage)
2764                         if mobj is not None:
2765                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2766
2767                 # Video urls
2768                 video_urls = {}
2769                 for fmt in self._available_formats:
2770                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2771                         if mobj is not None:
2772                                 # URL is in a Javascript segment inside an escaped Unicode format within
2773                                 # the generally utf-8 page
2774                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2775                 video_info['video_urls'] = video_urls
2776
2777                 return video_info
2778
2779         def _real_initialize(self):
2780                 if self._downloader is None:
2781                         return
2782
2783                 useremail = None
2784                 password = None
2785                 downloader_params = self._downloader.params
2786
2787                 # Attempt to use provided username and password or .netrc data
2788                 if downloader_params.get('username', None) is not None:
2789                         useremail = downloader_params['username']
2790                         password = downloader_params['password']
2791                 elif downloader_params.get('usenetrc', False):
2792                         try:
2793                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2794                                 if info is not None:
2795                                         useremail = info[0]
2796                                         password = info[2]
2797                                 else:
2798                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2799                         except (IOError, netrc.NetrcParseError), err:
2800                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2801                                 return
2802
2803                 if useremail is None:
2804                         return
2805
2806                 # Log in
2807                 login_form = {
2808                         'email': useremail,
2809                         'pass': password,
2810                         'login': 'Log+In'
2811                         }
2812                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2813                 try:
2814                         self.report_login()
2815                         login_results = urllib2.urlopen(request).read()
2816                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2817                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2818                                 return
2819                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2820                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2821                         return
2822
2823         def _real_extract(self, url):
2824                 mobj = re.match(self._VALID_URL, url)
2825                 if mobj is None:
2826                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2827                         return
2828                 video_id = mobj.group('ID')
2829
2830                 # Get video webpage
2831                 self.report_video_webpage_download(video_id)
2832                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2833                 try:
2834                         page = urllib2.urlopen(request)
2835                         video_webpage = page.read()
2836                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2837                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2838                         return
2839
2840                 # Start extracting information
2841                 self.report_information_extraction(video_id)
2842
2843                 # Extract information
2844                 video_info = self._parse_page(video_webpage)
2845
2846                 # uploader
2847                 if 'owner' not in video_info:
2848                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2849                         return
2850                 video_uploader = video_info['owner']
2851
2852                 # title
2853                 if 'title' not in video_info:
2854                         self._downloader.trouble(u'ERROR: unable to extract video title')
2855                         return
2856                 video_title = video_info['title']
2857                 video_title = video_title.decode('utf-8')
2858                 video_title = sanitize_title(video_title)
2859
2860                 simple_title = _simplify_title(video_title)
2861
2862                 # thumbnail image
2863                 if 'thumbnail' not in video_info:
2864                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2865                         video_thumbnail = ''
2866                 else:
2867                         video_thumbnail = video_info['thumbnail']
2868
2869                 # upload date
2870                 upload_date = u'NA'
2871                 if 'upload_date' in video_info:
2872                         upload_time = video_info['upload_date']
2873                         timetuple = email.utils.parsedate_tz(upload_time)
2874                         if timetuple is not None:
2875                                 try:
2876                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2877                                 except:
2878                                         pass
2879
2880                 # description
2881                 video_description = video_info.get('description', 'No description available.')
2882
2883                 url_map = video_info['video_urls']
2884                 if len(url_map.keys()) > 0:
2885                         # Decide which formats to download
2886                         req_format = self._downloader.params.get('format', None)
2887                         format_limit = self._downloader.params.get('format_limit', None)
2888
2889                         if format_limit is not None and format_limit in self._available_formats:
2890                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2891                         else:
2892                                 format_list = self._available_formats
2893                         existing_formats = [x for x in format_list if x in url_map]
2894                         if len(existing_formats) == 0:
2895                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2896                                 return
2897                         if req_format is None:
2898                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2899                         elif req_format == 'worst':
2900                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2901                         elif req_format == '-1':
2902                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2903                         else:
2904                                 # Specific format
2905                                 if req_format not in url_map:
2906                                         self._downloader.trouble(u'ERROR: requested format not available')
2907                                         return
2908                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2909
2910                 for format_param, video_real_url in video_url_list:
2911
2912                         # At this point we have a new video
2913                         self._downloader.increment_downloads()
2914
2915                         # Extension
2916                         video_extension = self._video_extensions.get(format_param, 'mp4')
2917
2918                         try:
2919                                 # Process video information
2920                                 self._downloader.process_info({
2921                                         'id':           video_id.decode('utf-8'),
2922                                         'url':          video_real_url.decode('utf-8'),
2923                                         'uploader':     video_uploader.decode('utf-8'),
2924                                         'upload_date':  upload_date,
2925                                         'title':        video_title,
2926                                         'stitle':       simple_title,
2927                                         'ext':          video_extension.decode('utf-8'),
2928                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2929                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2930                                         'description':  video_description.decode('utf-8'),
2931                                         'player_url':   None,
2932                                 })
2933                         except UnavailableVideoError, err:
2934                                 self._downloader.trouble(u'\nERROR: unable to download video')
2935
2936 class BlipTVIE(InfoExtractor):
2937         """Information extractor for blip.tv"""
2938
2939         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2940         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2941         IE_NAME = u'blip.tv'
2942
2943         def report_extraction(self, file_id):
2944                 """Report information extraction."""
2945                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2946
2947         def report_direct_download(self, title):
2948                 """Report information extraction."""
2949                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2950
	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL.

		Two server behaviours are handled: the JSON API request may be
		answered with the video file itself (direct download, detected via
		the Content-Type header) or with a JSON description of the video
		(regular case).
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON API parameters, respecting any existing query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# Derive id/title/extension from the last URL path component.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh  # pass the open handle on so the video is not requested twice
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# urlh is guaranteed open here: the except above returns on failure.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The payload is either wrapped in a 'Post' object or bare.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# e.g. '12-31-11 11:05PM' -> '20111231'
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# KeyError covers any missing JSON field; ValueError covers
				# bad JSON, bad datestamp, and the extension check above.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3028
3029
3030 class MyVideoIE(InfoExtractor):
3031         """Information Extractor for myvideo.de."""
3032
3033         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3034         IE_NAME = u'myvideo'
3035
3036         def __init__(self, downloader=None):
3037                 InfoExtractor.__init__(self, downloader)
3038         
3039         def report_download_webpage(self, video_id):
3040                 """Report webpage download."""
3041                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3042
3043         def report_extraction(self, video_id):
3044                 """Report information extraction."""
3045                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3046
3047         def _real_extract(self,url):
3048                 mobj = re.match(self._VALID_URL, url)
3049                 if mobj is None:
3050                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3051                         return
3052
3053                 video_id = mobj.group(1)
3054
3055                 # Get video webpage
3056                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3057                 try:
3058                         self.report_download_webpage(video_id)
3059                         webpage = urllib2.urlopen(request).read()
3060                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3061                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3062                         return
3063
3064                 self.report_extraction(video_id)
3065                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3066                                  webpage)
3067                 if mobj is None:
3068                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3069                         return
3070                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3071
3072                 mobj = re.search('<title>([^<]+)</title>', webpage)
3073                 if mobj is None:
3074                         self._downloader.trouble(u'ERROR: unable to extract title')
3075                         return
3076
3077                 video_title = mobj.group(1)
3078                 video_title = sanitize_title(video_title)
3079
3080                 simple_title = _simplify_title(video_title)
3081
3082                 try:
3083                         self._downloader.process_info({
3084                                 'id':           video_id,
3085                                 'url':          video_url,
3086                                 'uploader':     u'NA',
3087                                 'upload_date':  u'NA',
3088                                 'title':        video_title,
3089                                 'stitle':       simple_title,
3090                                 'ext':          u'flv',
3091                                 'format':       u'NA',
3092                                 'player_url':   None,
3093                         })
3094                 except UnavailableVideoError:
3095                         self._downloader.trouble(u'\nERROR: Unable to download video')
3096
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname pseudo-URL (":tds", ":colbert", ...) or a
	# real full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a media configuration file."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the show's episode index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve a shortname or episode URL, then download every media
		item listed in the episode's MRSS index.

		Pipeline: shortname -> full-episodes URL -> (possible redirect to a
		specific episode) -> Flash player URL -> MRSS index -> per-item
		mediagen config with the actual video renditions.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map a shortname to the show's full-episodes landing page and
		# re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode in the URL means "download the newest episode"; the
		# site redirects the landing page to it.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to discover which episode is newest.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The <param name="movie"> value contains both the player URL and
		# the mtvn URI identifying the episode.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve redirects on the player URL; rtmpdump needs the final one.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several <item> acts; download them all.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# <guid> looks like "...:<show>.com:<mediaId>".
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect the available (bitrate, url) renditions.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				# Keep going: the remaining acts may still be downloadable.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3231
3232
3233 class EscapistIE(InfoExtractor):
3234         """Information extractor for The Escapist """
3235
3236         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3237         IE_NAME = u'escapist'
3238
3239         def report_extraction(self, showName):
3240                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3241
3242         def report_config_download(self, showName):
3243                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3244
3245         def _real_extract(self, url):
3246                 htmlParser = HTMLParser.HTMLParser()
3247
3248                 mobj = re.match(self._VALID_URL, url)
3249                 if mobj is None:
3250                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3251                         return
3252                 showName = mobj.group('showname')
3253                 videoId = mobj.group('episode')
3254
3255                 self.report_extraction(showName)
3256                 try:
3257                         webPage = urllib2.urlopen(url).read()
3258                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3260                         return
3261
3262                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3263                 description = htmlParser.unescape(descMatch.group(1))
3264                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3265                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3266                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3267                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3268                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3269                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3270
3271                 self.report_config_download(showName)
3272                 try:
3273                         configJSON = urllib2.urlopen(configUrl).read()
3274                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3276                         return
3277
3278                 # Technically, it's JavaScript, not JSON
3279                 configJSON = configJSON.replace("'", '"')
3280
3281                 try:
3282                         config = json.loads(configJSON)
3283                 except (ValueError,), err:
3284                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3285                         return
3286
3287                 playlist = config['playlist']
3288                 videoUrl = playlist[1]['url']
3289
3290                 self._downloader.increment_downloads()
3291                 info = {
3292                         'id': videoId,
3293                         'url': videoUrl,
3294                         'uploader': showName,
3295                         'upload_date': None,
3296                         'title': showName,
3297                         'stitle': _simplify_title(showName),
3298                         'ext': 'flv',
3299                         'format': 'flv',
3300                         'thumbnail': imgUrl,
3301                         'description': description,
3302                         'player_url': playerUrl,
3303                 }
3304
3305                 try:
3306                         self._downloader.process_info(info)
3307                 except UnavailableVideoError, err:
3308                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3309
3310
3311 class CollegeHumorIE(InfoExtractor):
3312         """Information extractor for collegehumor.com"""
3313
3314         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3315         IE_NAME = u'collegehumor'
3316
3317         def report_webpage(self, video_id):
3318                 """Report information extraction."""
3319                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3320
3321         def report_extraction(self, video_id):
3322                 """Report information extraction."""
3323                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3324
3325         def _real_extract(self, url):
3326                 htmlParser = HTMLParser.HTMLParser()
3327
3328                 mobj = re.match(self._VALID_URL, url)
3329                 if mobj is None:
3330                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3331                         return
3332                 video_id = mobj.group('videoid')
3333
3334                 self.report_webpage(video_id)
3335                 request = urllib2.Request(url)
3336                 try:
3337                         webpage = urllib2.urlopen(request).read()
3338                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3339                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3340                         return
3341
3342                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3343                 if m is None:
3344                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3345                         return
3346                 internal_video_id = m.group('internalvideoid')
3347
3348                 info = {
3349                         'id': video_id,
3350                         'internal_id': internal_video_id,
3351                 }
3352
3353                 self.report_extraction(video_id)
3354                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3355                 try:
3356                         metaXml = urllib2.urlopen(xmlUrl).read()
3357                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3358                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3359                         return
3360
3361                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3362                 try:
3363                         videoNode = mdoc.findall('./video')[0]
3364                         info['description'] = videoNode.findall('./description')[0].text
3365                         info['title'] = videoNode.findall('./caption')[0].text
3366                         info['stitle'] = _simplify_title(info['title'])
3367                         info['url'] = videoNode.findall('./file')[0].text
3368                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3369                         info['ext'] = info['url'].rpartition('.')[2]
3370                         info['format'] = info['ext']
3371                 except IndexError:
3372                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3373                         return
3374
3375                 self._downloader.increment_downloads()
3376
3377                 try:
3378                         self._downloader.process_info(info)
3379                 except UnavailableVideoError, err:
3380                         self._downloader.trouble(u'\nERROR: unable to download video')
3381
3382
3383 class XVideosIE(InfoExtractor):
3384         """Information extractor for xvideos.com"""
3385
3386         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3387         IE_NAME = u'xvideos'
3388
3389         def report_webpage(self, video_id):
3390                 """Report information extraction."""
3391                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3392
3393         def report_extraction(self, video_id):
3394                 """Report information extraction."""
3395                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3396
3397         def _real_extract(self, url):
3398                 htmlParser = HTMLParser.HTMLParser()
3399
3400                 mobj = re.match(self._VALID_URL, url)
3401                 if mobj is None:
3402                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3403                         return
3404                 video_id = mobj.group(1).decode('utf-8')
3405
3406                 self.report_webpage(video_id)
3407
3408                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3409                 try:
3410                         webpage = urllib2.urlopen(request).read()
3411                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3412                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3413                         return
3414
3415                 self.report_extraction(video_id)
3416
3417
3418                 # Extract video URL
3419                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3420                 if mobj is None:
3421                         self._downloader.trouble(u'ERROR: unable to extract video url')
3422                         return
3423                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3424
3425
3426                 # Extract title
3427                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3428                 if mobj is None:
3429                         self._downloader.trouble(u'ERROR: unable to extract video title')
3430                         return
3431                 video_title = mobj.group(1).decode('utf-8')
3432
3433
3434                 # Extract video thumbnail
3435                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3436                 if mobj is None:
3437                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3438                         return
3439                 video_thumbnail = mobj.group(1).decode('utf-8')
3440
3441
3442
3443                 self._downloader.increment_downloads()
3444                 info = {
3445                         'id': video_id,
3446                         'url': video_url,
3447                         'uploader': None,
3448                         'upload_date': None,
3449                         'title': video_title,
3450                         'stitle': _simplify_title(video_title),
3451                         'ext': 'flv',
3452                         'format': 'flv',
3453                         'thumbnail': video_thumbnail,
3454                         'description': None,
3455                         'player_url': None,
3456                 }
3457
3458                 try:
3459                         self._downloader.process_info(info)
3460                 except UnavailableVideoError, err:
3461                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3462
3463
3464 class SoundcloudIE(InfoExtractor):
3465         """Information extractor for soundcloud.com
3466            To access the media, the uid of the song and a stream token
3467            must be extracted from the page source and the script must make
3468            a request to media.soundcloud.com/crossdomain.xml. Then
3469            the media can be grabbed by requesting from an url composed
3470            of the stream token and uid
3471          """
3472
3473         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3474         IE_NAME = u'soundcloud'
3475
3476         def __init__(self, downloader=None):
3477                 InfoExtractor.__init__(self, downloader)
3478
3479         def report_webpage(self, video_id):
3480                 """Report information extraction."""
3481                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3482
3483         def report_extraction(self, video_id):
3484                 """Report information extraction."""
3485                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3486
3487         def _real_extract(self, url):
3488                 htmlParser = HTMLParser.HTMLParser()
3489
3490                 mobj = re.match(self._VALID_URL, url)
3491                 if mobj is None:
3492                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3493                         return
3494
3495                 # extract uploader (which is in the url)
3496                 uploader = mobj.group(1).decode('utf-8')
3497                 # extract simple title (uploader + slug of song title)
3498                 slug_title =  mobj.group(2).decode('utf-8')
3499                 simple_title = uploader + '-' + slug_title
3500
3501                 self.report_webpage('%s/%s' % (uploader, slug_title))
3502
3503                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3504                 try:
3505                         webpage = urllib2.urlopen(request).read()
3506                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3507                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3508                         return
3509
3510                 self.report_extraction('%s/%s' % (uploader, slug_title))
3511
3512                 # extract uid and stream token that soundcloud hands out for access
3513                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3514                 if mobj:
3515                         video_id = mobj.group(1)
3516                         stream_token = mobj.group(2)
3517
3518                 # extract unsimplified title
3519                 mobj = re.search('"title":"(.*?)",', webpage)
3520                 if mobj:
3521                         title = mobj.group(1)
3522
3523                 # construct media url (with uid/token)
3524                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3525                 mediaURL = mediaURL % (video_id, stream_token)
3526
3527                 # description
3528                 description = u'No description available'
3529                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3530                 if mobj:
3531                         description = mobj.group(1)
3532                 
3533                 # upload date
3534                 upload_date = None
3535                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3536                 if mobj:
3537                         try:
3538                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3539                         except Exception, e:
3540                                 print str(e)
3541
3542                 # for soundcloud, a request to a cross domain is required for cookies
3543                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3544
3545                 try:
3546                         self._downloader.process_info({
3547                                 'id':           video_id.decode('utf-8'),
3548                                 'url':          mediaURL,
3549                                 'uploader':     uploader.decode('utf-8'),
3550                                 'upload_date':  upload_date,
3551                                 'title':        simple_title.decode('utf-8'),
3552                                 'stitle':       simple_title.decode('utf-8'),
3553                                 'ext':          u'mp3',
3554                                 'format':       u'NA',
3555                                 'player_url':   None,
3556                                 'description': description.decode('utf-8')
3557                         })
3558                 except UnavailableVideoError:
3559                         self._downloader.trouble(u'\nERROR: unable to download video')
3560
3561
3562 class InfoQIE(InfoExtractor):
3563         """Information extractor for infoq.com"""
3564
3565         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3566         IE_NAME = u'infoq'
3567
3568         def report_webpage(self, video_id):
3569                 """Report information extraction."""
3570                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3571
3572         def report_extraction(self, video_id):
3573                 """Report information extraction."""
3574                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3575
3576         def _real_extract(self, url):
3577                 htmlParser = HTMLParser.HTMLParser()
3578
3579                 mobj = re.match(self._VALID_URL, url)
3580                 if mobj is None:
3581                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3582                         return
3583
3584                 self.report_webpage(url)
3585
3586                 request = urllib2.Request(url)
3587                 try:
3588                         webpage = urllib2.urlopen(request).read()
3589                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3590                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3591                         return
3592
3593                 self.report_extraction(url)
3594
3595
3596                 # Extract video URL
3597                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3598                 if mobj is None:
3599                         self._downloader.trouble(u'ERROR: unable to extract video url')
3600                         return
3601                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3602
3603
3604                 # Extract title
3605                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3606                 if mobj is None:
3607                         self._downloader.trouble(u'ERROR: unable to extract video title')
3608                         return
3609                 video_title = mobj.group(1).decode('utf-8')
3610
3611                 # Extract description
3612                 video_description = u'No description available.'
3613                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3614                 if mobj is not None:
3615                         video_description = mobj.group(1).decode('utf-8')
3616
3617                 video_filename = video_url.split('/')[-1]
3618                 video_id, extension = video_filename.split('.')
3619
3620                 self._downloader.increment_downloads()
3621                 info = {
3622                         'id': video_id,
3623                         'url': video_url,
3624                         'uploader': None,
3625                         'upload_date': None,
3626                         'title': video_title,
3627                         'stitle': _simplify_title(video_title),
3628                         'ext': extension,
3629                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3630                         'thumbnail': None,
3631                         'description': video_description,
3632                         'player_url': None,
3633                 }
3634
3635                 try:
3636                         self._downloader.process_info(info)
3637                 except UnavailableVideoError, err:
3638                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3639
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json.

		jsonData[fmt] is either a dict mapping bitrate -> url list, or
		(for formats without bitrate info) a plain url list; the TypeError
		fallback below covers the latter case.
		"""
		# NOTE(review): file_url is assigned but never used in this method.
		file_url = None
		try:
			bitrate_list = jsonData[fmt]
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]

		return url_list

	def check_urls(self, url_list):
		"""Returns 1st active url from list"""
		# Probes each candidate with a GET and returns the first one that
		# does not error out; None if every candidate fails.
		for url in url_list:
			try:
				urllib2.urlopen(url)
				return url
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				url = None

		return None

	def _print_formats(self, formats):
		# Print a human-readable "format  bitrate  [ext]" table; formats
		# without bitrate info get '??' in the bitrate column.
		print 'Available formats:'
		for fmt in formats.keys():
			for b in formats[fmt]:
				try:
					ext = formats[fmt][b][0]
					print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
					break

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')

		# construct API request
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = urllib2.Request(file_url)
		try:
			self.report_download_json(file_url)
			jsonData = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
			return

		# parse JSON
		# NOTE(review): 'json' does not appear in the imports visible at the
		# top of this file chunk — presumably bound elsewhere in the file
		# (e.g. a json/simplejson fallback import); verify.
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)
		bitrate = None

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		if req_format is None or req_format == 'best':
			# No specific format requested: take the first format that has
			# at least one reachable URL.
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				# check urls
				file_url = self.check_urls(url_list)
				if file_url is not None:
					break # got it!
		else:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
				return

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		# We have audio
		self._downloader.increment_downloads()
		try:
			# Process file information
			self._downloader.process_info({
				'id':           file_id.decode('utf-8'),
				'url':          file_url.decode('utf-8'),
				'uploader':     uploader.decode('utf-8'),
				'upload_date':  u'NA',
				'title':        json_data['name'],
				'stitle':       _simplify_title(json_data['name']),
				'ext':          file_url.split('.')[-1].decode('utf-8'),
				'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':    json_data['thumbnail_url'],
				'description':  json_data['description'],
				'player_url':   player_url.decode('utf-8'),
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
3760
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		# Three URL shapes are handled, dispatched on the named regex groups:
		#   1. course + video -> download that single video
		#   2. course only    -> playlist of every VideoPage link on the course page
		#   3. root page      -> playlist of every CoursePage link on the home page
		# Playlist cases recurse via self.extract() on 'reference' entries.
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# Title and relative media path come from the per-video XML.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title from the page's <h1>; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Every video page link becomes a 'reference' playlist entry.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Every course page link becomes a 'reference' playlist entry.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3879
3880
class PostProcessor(object):
	"""Base class for all post processors.

	Instances are attached to a downloader via its add_post_processor()
	method. After every successful download the downloader walks its list
	of post processors, feeding each one's run() the dictionary produced
	by the previous stage (the first stage receives the downloader's own
	info dict). A stage returning None aborts the rest of the chain.

	The downloader/post-processor relationship mirrors the "mutual
	registration" scheme used by InfoExtractor objects.
	"""

	# Downloader this post processor is registered with (None until set).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		'information' is an InfoExtractor-style dictionary extended with a
		'filepath' key naming the file on disk. Return a (possibly
		modified) dictionary to hand to the next stage, or None to stop
		the chain. Implementations may also raise PostProcessingError to
		signal failure to the calling downloader.
		"""
		# The base implementation is a no-op pass-through.
		return information
3926
3927
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	using ffprobe/ffmpeg, remuxing losslessly when possible and otherwise
	transcoding to the preferred codec."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec      # target codec name, or 'best'
		self._preferredquality = preferredquality  # bitrate passed to ffmpeg -ab
		self._keepvideo = keepvideo                # keep the source video file?

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in 'path', or None if
		ffprobe is missing, fails, or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# In ffprobe's stream sections codec_name appears before codec_type,
		# so remember the last codec_name and report it when the section
		# turns out to be an audio stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract the audio of 'path' into 'out_path' with
		the given codec and extra options. Return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Best-effort only — but the previous bare 'except:' also
				# swallowed KeyboardInterrupt/SystemExit; catch Exception.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4034
4035
4036 def updateSelf(downloader, filename):
4037         ''' Update the program file with the latest version from the repository '''
4038         # Note: downloader only used for options
4039         if not os.access(filename, os.W_OK):
4040                 sys.exit('ERROR: no write permissions on %s' % filename)
4041
4042         downloader.to_screen('Updating to latest version...')
4043
4044         try:
4045                 try:
4046                         urlh = urllib.urlopen(UPDATE_URL)
4047                         newcontent = urlh.read()
4048                         
4049                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4050                         if vmatch is not None and vmatch.group(1) == __version__:
4051                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4052                                 return
4053                 finally:
4054                         urlh.close()
4055         except (IOError, OSError), err:
4056                 sys.exit('ERROR: unable to download latest version')
4057
4058         try:
4059                 outf = open(filename, 'wb')
4060                 try:
4061                         outf.write(newcontent)
4062                 finally:
4063                         outf.close()
4064         except (IOError, OSError), err:
4065                 sys.exit('ERROR: unable to overwrite current version')
4066
4067         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4068
4069 def parseOpts():
4070         # Deferred imports
4071         import getpass
4072         import optparse
4073         import shlex
4074
4075         def _readOptions(filename):
4076                 try:
4077                         optionf = open(filename)
4078                 except IOError:
4079                         return [] # silently skip if file is not present
4080                 try:
4081                         res = []
4082                         for l in optionf:
4083                                 res += shlex.split(l, comments=True)
4084                 finally:
4085                         optionf.close()
4086                 return res
4087
4088         def _format_option_string(option):
4089                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4090
4091                 opts = []
4092
4093                 if option._short_opts: opts.append(option._short_opts[0])
4094                 if option._long_opts: opts.append(option._long_opts[0])
4095                 if len(opts) > 1: opts.insert(1, ', ')
4096
4097                 if option.takes_value(): opts.append(' %s' % option.metavar)
4098
4099                 return "".join(opts)
4100
4101         def _find_term_columns():
4102                 columns = os.environ.get('COLUMNS', None)
4103                 if columns:
4104                         return int(columns)
4105
4106                 try:
4107                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4108                         out,err = sp.communicate()
4109                         return int(out.split()[1])
4110                 except:
4111                         pass
4112                 return None
4113
4114         max_width = 80
4115         max_help_position = 80
4116
4117         # No need to wrap help messages if we're on a wide console
4118         columns = _find_term_columns()
4119         if columns: max_width = columns
4120
4121         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4122         fmt.format_option_strings = _format_option_string
4123
4124         kw = {
4125                 'version'   : __version__,
4126                 'formatter' : fmt,
4127                 'usage' : '%prog [options] url [url...]',
4128                 'conflict_handler' : 'resolve',
4129         }
4130
4131         parser = optparse.OptionParser(**kw)
4132
4133         # option groups
4134         general        = optparse.OptionGroup(parser, 'General Options')
4135         selection      = optparse.OptionGroup(parser, 'Video Selection')
4136         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4137         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4138         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4139         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4140         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4141
4142         general.add_option('-h', '--help',
4143                         action='help', help='print this help text and exit')
4144         general.add_option('-v', '--version',
4145                         action='version', help='print program version and exit')
4146         general.add_option('-U', '--update',
4147                         action='store_true', dest='update_self', help='update this program to latest version')
4148         general.add_option('-i', '--ignore-errors',
4149                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4150         general.add_option('-r', '--rate-limit',
4151                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4152         general.add_option('-R', '--retries',
4153                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4154         general.add_option('--dump-user-agent',
4155                         action='store_true', dest='dump_user_agent',
4156                         help='display the current browser identification', default=False)
4157         general.add_option('--list-extractors',
4158                         action='store_true', dest='list_extractors',
4159                         help='List all supported extractors and the URLs they would handle', default=False)
4160
4161         selection.add_option('--playlist-start',
4162                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4163         selection.add_option('--playlist-end',
4164                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4165         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4166         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4167         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4168
4169         authentication.add_option('-u', '--username',
4170                         dest='username', metavar='USERNAME', help='account username')
4171         authentication.add_option('-p', '--password',
4172                         dest='password', metavar='PASSWORD', help='account password')
4173         authentication.add_option('-n', '--netrc',
4174                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4175
4176
4177         video_format.add_option('-f', '--format',
4178                         action='store', dest='format', metavar='FORMAT', help='video format code')
4179         video_format.add_option('--all-formats',
4180                         action='store_const', dest='format', help='download all available video formats', const='all')
4181         video_format.add_option('--prefer-free-formats',
4182                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4183         video_format.add_option('--max-quality',
4184                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4185         video_format.add_option('-F', '--list-formats',
4186                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4187
4188
4189         verbosity.add_option('-q', '--quiet',
4190                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4191         verbosity.add_option('-s', '--simulate',
4192                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4193         verbosity.add_option('--skip-download',
4194                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4195         verbosity.add_option('-g', '--get-url',
4196                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4197         verbosity.add_option('-e', '--get-title',
4198                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4199         verbosity.add_option('--get-thumbnail',
4200                         action='store_true', dest='getthumbnail',
4201                         help='simulate, quiet but print thumbnail URL', default=False)
4202         verbosity.add_option('--get-description',
4203                         action='store_true', dest='getdescription',
4204                         help='simulate, quiet but print video description', default=False)
4205         verbosity.add_option('--get-filename',
4206                         action='store_true', dest='getfilename',
4207                         help='simulate, quiet but print output filename', default=False)
4208         verbosity.add_option('--get-format',
4209                         action='store_true', dest='getformat',
4210                         help='simulate, quiet but print output format', default=False)
4211         verbosity.add_option('--no-progress',
4212                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4213         verbosity.add_option('--console-title',
4214                         action='store_true', dest='consoletitle',
4215                         help='display progress in console titlebar', default=False)
4216
4217
4218         filesystem.add_option('-t', '--title',
4219                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4220         filesystem.add_option('-l', '--literal',
4221                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4222         filesystem.add_option('-A', '--auto-number',
4223                         action='store_true', dest='autonumber',
4224                         help='number downloaded files starting from 00000', default=False)
4225         filesystem.add_option('-o', '--output',
4226                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4227         filesystem.add_option('-a', '--batch-file',
4228                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4229         filesystem.add_option('-w', '--no-overwrites',
4230                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4231         filesystem.add_option('-c', '--continue',
4232                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4233         filesystem.add_option('--no-continue',
4234                         action='store_false', dest='continue_dl',
4235                         help='do not resume partially downloaded files (restart from beginning)')
4236         filesystem.add_option('--cookies',
4237                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4238         filesystem.add_option('--no-part',
4239                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4240         filesystem.add_option('--no-mtime',
4241                         action='store_false', dest='updatetime',
4242                         help='do not use the Last-modified header to set the file modification time', default=True)
4243         filesystem.add_option('--write-description',
4244                         action='store_true', dest='writedescription',
4245                         help='write video description to a .description file', default=False)
4246         filesystem.add_option('--write-info-json',
4247                         action='store_true', dest='writeinfojson',
4248                         help='write video metadata to a .info.json file', default=False)
4249
4250
4251         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4252                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4253         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4254                         help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4255         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4256                         help='ffmpeg audio bitrate specification, 128k by default')
4257         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4258                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4259
4260
4261         parser.add_option_group(general)
4262         parser.add_option_group(selection)
4263         parser.add_option_group(filesystem)
4264         parser.add_option_group(verbosity)
4265         parser.add_option_group(video_format)
4266         parser.add_option_group(authentication)
4267         parser.add_option_group(postproc)
4268
4269         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4270         if xdg_config_home:
4271                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4272         else:
4273                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4274         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4275         opts, args = parser.parse_args(argv)
4276
4277         return parser, opts, args
4278
def gen_extractors():
	"""Instantiate every supported information extractor.

	The returned list's order is significant: the first extractor whose
	suitable() matches a URL is the one that handles it, so more specific
	extractors come before broader ones (GenericIE is listed last,
	presumably as the fallback).
	"""
	# These three back-ends are shared: their playlist/user/search
	# wrappers delegate actual video extraction to the same instance.
	yt_backend = YoutubeIE()
	google_backend = GoogleIE()
	yahoo_backend = YahooIE()

	extractors = [
		YoutubePlaylistIE(yt_backend),
		YoutubeUserIE(yt_backend),
		YoutubeSearchIE(yt_backend),
		yt_backend,
		MetacafeIE(yt_backend),
		DailymotionIE(),
		google_backend,
		GoogleSearchIE(google_backend),
		PhotobucketIE(),
		yahoo_backend,
		YahooSearchIE(yahoo_backend),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),

		GenericIE()
	]
	return extractors
4314
4315 def _real_main():
4316         parser, opts, args = parseOpts()
4317
4318         # Open appropriate CookieJar
4319         if opts.cookiefile is None:
4320                 jar = cookielib.CookieJar()
4321         else:
4322                 try:
4323                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4324                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4325                                 jar.load()
4326                 except (IOError, OSError), err:
4327                         sys.exit(u'ERROR: unable to open cookie file')
4328
4329         # Dump user agent
4330         if opts.dump_user_agent:
4331                 print std_headers['User-Agent']
4332                 sys.exit(0)
4333
4334         # Batch file verification
4335         batchurls = []
4336         if opts.batchfile is not None:
4337                 try:
4338                         if opts.batchfile == '-':
4339                                 batchfd = sys.stdin
4340                         else:
4341                                 batchfd = open(opts.batchfile, 'r')
4342                         batchurls = batchfd.readlines()
4343                         batchurls = [x.strip() for x in batchurls]
4344                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4345                 except IOError:
4346                         sys.exit(u'ERROR: batch file could not be read')
4347         all_urls = batchurls + args
4348
4349         # General configuration
4350         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4351         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4352         urllib2.install_opener(opener)
4353         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4354
4355         extractors = gen_extractors()
4356
4357         if opts.list_extractors:
4358                 for ie in extractors:
4359                         print(ie.IE_NAME)
4360                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4361                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4362                         for mu in matchedUrls:
4363                                 print(u'  ' + mu)
4364                 sys.exit(0)
4365
4366         # Conflicting, missing and erroneous options
4367         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4368                 parser.error(u'using .netrc conflicts with giving username/password')
4369         if opts.password is not None and opts.username is None:
4370                 parser.error(u'account username missing')
4371         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4372                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4373         if opts.usetitle and opts.useliteral:
4374                 parser.error(u'using title conflicts with using literal title')
4375         if opts.username is not None and opts.password is None:
4376                 opts.password = getpass.getpass(u'Type account password and press return:')
4377         if opts.ratelimit is not None:
4378                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4379                 if numeric_limit is None:
4380                         parser.error(u'invalid rate limit specified')
4381                 opts.ratelimit = numeric_limit
4382         if opts.retries is not None:
4383                 try:
4384                         opts.retries = long(opts.retries)
4385                 except (TypeError, ValueError), err:
4386                         parser.error(u'invalid retry count specified')
4387         try:
4388                 opts.playliststart = int(opts.playliststart)
4389                 if opts.playliststart <= 0:
4390                         raise ValueError(u'Playlist start must be positive')
4391         except (TypeError, ValueError), err:
4392                 parser.error(u'invalid playlist start number specified')
4393         try:
4394                 opts.playlistend = int(opts.playlistend)
4395                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4396                         raise ValueError(u'Playlist end must be greater than playlist start')
4397         except (TypeError, ValueError), err:
4398                 parser.error(u'invalid playlist end number specified')
4399         if opts.extractaudio:
4400                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4401                         parser.error(u'invalid audio format specified')
4402
4403         # File downloader
4404         fd = FileDownloader({
4405                 'usenetrc': opts.usenetrc,
4406                 'username': opts.username,
4407                 'password': opts.password,
4408                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4409                 'forceurl': opts.geturl,
4410                 'forcetitle': opts.gettitle,
4411                 'forcethumbnail': opts.getthumbnail,
4412                 'forcedescription': opts.getdescription,
4413                 'forcefilename': opts.getfilename,
4414                 'forceformat': opts.getformat,
4415                 'simulate': opts.simulate,
4416                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4417                 'format': opts.format,
4418                 'format_limit': opts.format_limit,
4419                 'listformats': opts.listformats,
4420                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4421                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4422                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4423                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4424                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4425                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4426                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4427                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4428                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4429                         or u'%(id)s.%(ext)s'),
4430                 'ignoreerrors': opts.ignoreerrors,
4431                 'ratelimit': opts.ratelimit,
4432                 'nooverwrites': opts.nooverwrites,
4433                 'retries': opts.retries,
4434                 'continuedl': opts.continue_dl,
4435                 'noprogress': opts.noprogress,
4436                 'playliststart': opts.playliststart,
4437                 'playlistend': opts.playlistend,
4438                 'logtostderr': opts.outtmpl == '-',
4439                 'consoletitle': opts.consoletitle,
4440                 'nopart': opts.nopart,
4441                 'updatetime': opts.updatetime,
4442                 'writedescription': opts.writedescription,
4443                 'writeinfojson': opts.writeinfojson,
4444                 'matchtitle': opts.matchtitle,
4445                 'rejecttitle': opts.rejecttitle,
4446                 'max_downloads': opts.max_downloads,
4447                 'prefer_free_formats': opts.prefer_free_formats,
4448                 })
4449         for extractor in extractors:
4450                 fd.add_info_extractor(extractor)
4451
4452         # PostProcessors
4453         if opts.extractaudio:
4454                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4455
4456         # Update version
4457         if opts.update_self:
4458                 updateSelf(fd, sys.argv[0])
4459
4460         # Maybe do nothing
4461         if len(all_urls) < 1:
4462                 if not opts.update_self:
4463                         parser.error(u'you must provide at least one URL')
4464                 else:
4465                         sys.exit()
4466         
4467         try:
4468                 retcode = fd.download(all_urls)
4469         except MaxDownloadsReached:
4470                 fd.to_screen(u'--max-download limit reached, aborting.')
4471                 retcode = 101
4472
4473         # Dump cookie jar if requested
4474         if opts.cookiefile is not None:
4475                 try:
4476                         jar.save()
4477                 except (IOError, OSError), err:
4478                         sys.exit(u'ERROR: unable to save cookie jar')
4479
4480         sys.exit(retcode)
4481
def main():
	"""Command-line entry point.

	Runs _real_main() and maps the well-known failure modes to process
	exit codes/messages instead of letting them surface as tracebacks.
	"""
	try:
		_real_main()
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4491
# Allow running this module directly as a script.
if __name__ == '__main__':
	main()
4494
4495 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: