Release 2011.12.08
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.12.08'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48         import ctypes
49
50 try:
51         import email.utils
52 except ImportError: # Python 2.4
53         import email.Utils
54 try:
55         import cStringIO as StringIO
56 except ImportError:
57         import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61         from urlparse import parse_qs
62 except ImportError:
63         from cgi import parse_qs
64
65 try:
66         import lxml.etree
67 except ImportError:
68         pass # Handled below
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers attached to every request (see YoutubeDLHandler.http_request,
# which forces these onto each outgoing request). The User-Agent mimics a
# desktop Firefox so that sites serve their normal HTML pages.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		# Minimal stand-in for the stdlib json module: only loads() is provided.
		# It is a recursive-descent parser over the unicode string s; every
		# parseXxx helper takes an index i and returns (next_index, value).
		@staticmethod
		def loads(s):
			"""Parse a JSON document from the byte string s and return the value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# All parse errors funnel through here so the message always
				# carries the position and the remaining input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past JSON whitespace; optionally fail on premature end.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (regex group 1) into its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair: combine the \uD8xx\uDCxx halves
						# into a single code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				while True:
					# Find the closing quote, skipping any quote preceded by an
					# odd number of backslashes (i.e. an escaped quote).
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# Surrogate pairs must be matched as one unit, before lone \uXXXX.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three JSON keyword literals.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent marks a float; everything else is an int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; anything unknown is
			# assumed to be a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Make sure the reported encoding actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Unknown or unusable encoding reported by the locale machinery:
		# fall back to UTF-8. (Was a bare `except:` wrapped in a needless
		# one-shot generator; this also no longer swallows KeyboardInterrupt.)
		pref = 'UTF-8'
	return pref
211
212
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference: hexadecimal (&#x2F;) or decimal (&#47;).
	# The previous pattern `x?\d+` could not match hex digits a-f, so
	# entities such as &#xff; fell through to the literal fallback.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr # '0x..' form accepted by long(..., 16)
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
238
239
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# First resolve HTML entities, then keep the path separator out of the name.
	decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
	return decoded.replace(unicode(os.sep), u'%')
244
245
246 def sanitize_open(filename, open_mode):
247         """Try to open the given filename, and slightly tweak it if this fails.
248
249         Attempts to open the given filename. If this fails, it tries to change
250         the filename slightly, step by step, until it's either able to open it
251         or it fails and raises a final exception, like the standard open()
252         function.
253
254         It returns the tuple (stream, definitive_file_name).
255         """
256         try:
257                 if filename == u'-':
258                         if sys.platform == 'win32':
259                                 import msvcrt
260                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261                         return (sys.stdout, filename)
262                 stream = open(filename, open_mode)
263                 return (stream, filename)
264         except (IOError, OSError), err:
265                 # In case of error, try to remove win32 forbidden chars
266                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268                 # An exception here should be caught in the caller
269                 stream = open(filename, open_mode)
270                 return (stream, filename)
271
272
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		# Unparsable date header: signal failure with None.
		return None
	return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283         return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286         """ Remove all duplicates from the input iterable """
287         res = []
288         for el in iterable:
289                 if el not in res:
290                         res.append(el)
291         return res
292
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
	pass
301
302
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
	pass
310
311
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
	pass
319
class MaxDownloadsReached(Exception):
	"""Signals that the --max-downloads limit has been reached."""
	pass
323
324
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available for
	that video.
	"""
	pass
332
333
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the downloaded file is smaller
	than what the server announced first, indicating that the connection
	was probably interrupted.
	"""
	# Class-level defaults; both counters are in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate data without a zlib header; try
		# that interpretation first, then a standard zlib stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Compatibility shim: presumably older Pythons' addinfourl takes no
		# 'code' argument (the getcode probe decides) — set it by hand there.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force std_headers onto the request, replacing caller-set values.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Internal marker header: strip it and drop Accept-encoding so the
		# server sends an uncompressed body.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, re-wrapping it in a new
		# response object that keeps the original headers/url/code/msg.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
407
408
409 class FileDownloader(object):
410         """File Downloader class.
411
412         File downloader objects are the ones responsible of downloading the
413         actual video file and writing it to disk if the user has requested
414         it, among some other tasks. In most cases there should be one per
415         program. As, given a video URL, the downloader doesn't know how to
416         extract all the needed information, task that InfoExtractors do, it
417         has to pass the URL to one of them.
418
419         For this, file downloader objects have a method that allows
420         InfoExtractors to be registered in a given order. When it is passed
421         a URL, the file downloader handles it to the first InfoExtractor it
422         finds that reports being able to handle it. The InfoExtractor extracts
423         all the information about the video or videos the URL refers to, and
424         asks the FileDownloader to process the video information, possibly
425         downloading the video.
426
427         File downloaders accept a lot of parameters. In order not to saturate
428         the object constructor with arguments, it receives a dictionary of
429         options instead. These options are available through the params
430         attribute for the InfoExtractors to use. The FileDownloader also
431         registers itself as the downloader in charge for the InfoExtractors
432         that are added to it, so this is a "mutual registration".
433
434         Available options:
435
436         username:         Username for authentication purposes.
437         password:         Password for authentication purposes.
438         usenetrc:         Use netrc for authentication instead.
439         quiet:            Do not print messages to stdout.
440         forceurl:         Force printing final URL.
441         forcetitle:       Force printing title.
442         forcethumbnail:   Force printing thumbnail URL.
443         forcedescription: Force printing description.
444         forcefilename:    Force printing final filename.
445         simulate:         Do not download the video files.
446         format:           Video format code.
447         format_limit:     Highest quality format to try.
448         outtmpl:          Template for output names.
449         ignoreerrors:     Do not stop on download errors.
450         ratelimit:        Download speed limit, in bytes/sec.
451         nooverwrites:     Prevent overwriting files.
452         retries:          Number of times to retry for HTTP error 5xx
453         continuedl:       Try to continue downloads if possible.
454         noprogress:       Do not print the progress bar.
455         playliststart:    Playlist item to start at.
456         playlistend:      Playlist item to end at.
457         matchtitle:       Download only matching titles.
458         rejecttitle:      Reject downloads for matching titles.
459         logtostderr:      Log messages to stderr instead of stdout.
460         consoletitle:     Display progress in console window's titlebar.
461         nopart:           Do not use temporary .part files.
462         updatetime:       Use the Last-modified header to set output file timestamps.
463         writedescription: Write the video description to a .description file
464         writeinfojson:    Write the video description to a .info.json file
465         """
466
467         params = None
468         _ies = []
469         _pps = []
470         _download_retcode = None
471         _num_downloads = None
472         _screen_file = None
473
474         def __init__(self, params):
475                 """Create a FileDownloader object with the given options."""
476                 self._ies = []
477                 self._pps = []
478                 self._download_retcode = 0
479                 self._num_downloads = 0
480                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
481                 self.params = params
482
483         @staticmethod
484         def format_bytes(bytes):
485                 if bytes is None:
486                         return 'N/A'
487                 if type(bytes) is str:
488                         bytes = float(bytes)
489                 if bytes == 0.0:
490                         exponent = 0
491                 else:
492                         exponent = long(math.log(bytes, 1024.0))
493                 suffix = 'bkMGTPEZY'[exponent]
494                 converted = float(bytes) / float(1024 ** exponent)
495                 return '%.2f%s' % (converted, suffix)
496
497         @staticmethod
498         def calc_percent(byte_counter, data_len):
499                 if data_len is None:
500                         return '---.-%'
501                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
503         @staticmethod
504         def calc_eta(start, now, total, current):
505                 if total is None:
506                         return '--:--'
507                 dif = now - start
508                 if current == 0 or dif < 0.001: # One millisecond
509                         return '--:--'
510                 rate = float(current) / dif
511                 eta = long((float(total) - float(current)) / rate)
512                 (eta_mins, eta_secs) = divmod(eta, 60)
513                 if eta_mins > 99:
514                         return '--:--'
515                 return '%02d:%02d' % (eta_mins, eta_secs)
516
517         @staticmethod
518         def calc_speed(start, now, bytes):
519                 dif = now - start
520                 if bytes == 0 or dif < 0.001: # One millisecond
521                         return '%10s' % '---b/s'
522                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
523
524         @staticmethod
525         def best_block_size(elapsed_time, bytes):
526                 new_min = max(bytes / 2.0, 1.0)
527                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528                 if elapsed_time < 0.001:
529                         return long(new_max)
530                 rate = bytes / elapsed_time
531                 if rate > new_max:
532                         return long(new_max)
533                 if rate < new_min:
534                         return long(new_min)
535                 return long(rate)
536
537         @staticmethod
538         def parse_bytes(bytestr):
539                 """Parse a string indicating a byte quantity into a long integer."""
540                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
541                 if matchobj is None:
542                         return None
543                 number = float(matchobj.group(1))
544                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545                 return long(round(number * multiplier))
546
547         def add_info_extractor(self, ie):
548                 """Add an InfoExtractor object to the end of the list."""
549                 self._ies.append(ie)
550                 ie.set_downloader(self)
551
552         def add_post_processor(self, pp):
553                 """Add a PostProcessor object to the end of the chain."""
554                 self._pps.append(pp)
555                 pp.set_downloader(self)
556
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol suppresses the terminating newline; encoding errors are
		re-raised unless ignore_encoding_errors is set.
		"""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# Trailing comma keeps 'print' from adding its own newline;
				# the terminator above controls line endings instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
567
	def to_stderr(self, message):
		"""Print message to stderr, encoded with the system's preferred encoding."""
		print >>sys.stderr, message.encode(preferredencoding())
571
	def to_cons_title(self, message):
		"""Set console/terminal window title to message.

		No-op unless the 'consoletitle' option is enabled. Uses the Win32
		console API on Windows and an escape sequence on terminals.
		"""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# OSC 0 escape sequence: sets the window/icon title on
			# xterm-compatible terminals.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
583         def fixed_template(self):
584                 """Checks if the output template is fixed."""
585                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
586
587         def trouble(self, message=None):
588                 """Determine action to take when a download problem appears.
589
590                 Depending on if the downloader has been configured to ignore
591                 download errors or not, this method may throw an exception or
592                 not when errors are found, after printing the message.
593                 """
594                 if message is not None:
595                         self.to_stderr(message)
596                 if not self.params.get('ignoreerrors', False):
597                         raise DownloadError(message)
598                 self._download_retcode = 1
599
600         def slow_down(self, start_time, byte_counter):
601                 """Sleep if the download speed is over the rate limit."""
602                 rate_limit = self.params.get('ratelimit', None)
603                 if rate_limit is None or byte_counter == 0:
604                         return
605                 now = time.time()
606                 elapsed = now - start_time
607                 if elapsed <= 0.0:
608                         return
609                 speed = float(byte_counter) / elapsed
610                 if speed > rate_limit:
611                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613         def temp_name(self, filename):
614                 """Returns a temporary filename for the given filename."""
615                 if self.params.get('nopart', False) or filename == u'-' or \
616                                 (os.path.exists(filename) and not os.path.isfile(filename)):
617                         return filename
618                 return filename + u'.part'
619
620         def undo_temp_name(self, filename):
621                 if filename.endswith(u'.part'):
622                         return filename[:-len(u'.part')]
623                 return filename
624
625         def try_rename(self, old_filename, new_filename):
626                 try:
627                         if old_filename == new_filename:
628                                 return
629                         os.rename(old_filename, new_filename)
630                 except (IOError, OSError), err:
631                         self.trouble(u'ERROR: unable to rename file')
632
633         def try_utime(self, filename, last_modified_hdr):
634                 """Try to set the last-modified time of the given file."""
635                 if last_modified_hdr is None:
636                         return
637                 if not os.path.isfile(filename):
638                         return
639                 timestr = last_modified_hdr
640                 if timestr is None:
641                         return
642                 filetime = timeconvert(timestr)
643                 if filetime is None:
644                         return filetime
645                 try:
646                         os.utime(filename, (time.time(), filetime))
647                 except:
648                         pass
649                 return filetime
650
651         def report_writedescription(self, descfn):
652                 """ Report that the description file is being written """
653                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
654
655         def report_writeinfojson(self, infofn):
656                 """ Report that the metadata file has been written """
657                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
658
659         def report_destination(self, filename):
660                 """Report destination filename."""
661                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
662
663         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664                 """Report download progress."""
665                 if self.params.get('noprogress', False):
666                         return
667                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
671
672         def report_resuming_byte(self, resume_len):
673                 """Report attempt to resume at given byte."""
674                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
675
676         def report_retry(self, count, retries):
677                 """Report retry in case of HTTP error 5xx"""
678                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
679
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The filename may not be representable in the console encoding;
			# fall back to a message without it.
			self.to_screen(u'[download] The file has already been downloaded')
686
687         def report_unable_to_resume(self):
688                 """Report it was impossible to resume download."""
689                 self.to_screen(u'[download] Unable to resume')
690
691         def report_finish(self):
692                 """Report download finished."""
693                 if self.params.get('noprogress', False):
694                         self.to_screen(u'[download] Download completed')
695                 else:
696                         self.to_screen(u'')
697
698         def increment_downloads(self):
699                 """Increment the ordinal that assigns a number to each file."""
700                 self._num_downloads += 1
701
702         def prepare_filename(self, info_dict):
703                 """Generate the output filename."""
704                 try:
705                         template_dict = dict(info_dict)
706                         template_dict['epoch'] = unicode(long(time.time()))
707                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708                         filename = self.params['outtmpl'] % template_dict
709                         return filename
710                 except (ValueError, KeyError), err:
711                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
712                         return None
713
714         def _match_entry(self, info_dict):
715                 """ Returns None iff the file should be downloaded """
716
717                 title = info_dict['title']
718                 matchtitle = self.params.get('matchtitle', False)
719                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721                 rejecttitle = self.params.get('rejecttitle', False)
722                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724                 return None
725
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		Applies title filters and the --max-downloads cap, handles the
		--force-* printing options and simulate mode, then writes the
		optional .description / .info.json side files and finally hands
		off to _do_download() and the postprocessing chain.
		"""

		# Title match/reject filters may veto the download entirely.
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)
			return

		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				raise MaxDownloadsReached()

		filename = self.prepare_filename(info_dict)

		# Forced printings (--get-title, --get-url, ...): emit the raw value
		# to stdout, encoded for the user's locale.
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		# prepare_filename() returns None on template errors (already reported).
		if filename is None:
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Create the destination directory tree if needed.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			try:
				# Probe for a usable json module (absent on Python < 2.6
				# without a backport) before opening the output file.
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds a live connection object and is not
					# JSON-serializable, so it is stripped from the dump.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
824
825         def download(self, url_list):
826                 """Download a given list of URLs."""
827                 if len(url_list) > 1 and self.fixed_template():
828                         raise SameFileError(self.params['outtmpl'])
829
830                 for url in url_list:
831                         suitable_found = False
832                         for ie in self._ies:
833                                 # Go to next InfoExtractor if not suitable
834                                 if not ie.suitable(url):
835                                         continue
836
837                                 # Suitable InfoExtractor found
838                                 suitable_found = True
839
840                                 # Extract information from URL and process it
841                                 ie.extract(url)
842
843                                 # Suitable InfoExtractor had been found; go to next URL
844                                 break
845
846                         if not suitable_found:
847                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
848
849                 return self._download_retcode
850
851         def post_process(self, filename, ie_info):
852                 """Run the postprocessing chain on the given file."""
853                 info = dict(ie_info)
854                 info['filepath'] = filename
855                 for pp in self._pps:
856                         info = pp.run(info)
857                         if info is None:
858                                 break
859
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the external rtmpdump tool.

		Returns True on success, False on failure (already reported via
		trouble()). The file is written to a temporary name and renamed
		into place when rtmpdump finishes cleanly.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check that rtmpdump is runnable at all before building the real
		# command line; '-h' is only used as a cheap availability probe.
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# NOTE: [[], opts][cond] is a pre-ternary idiom: it selects opts
		# when cond is true and the empty list otherwise.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume with '-e'; add '-k 1' too when the previous run failed (retval 1).
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No bytes gained and rtmpdump reports failure: give up.
				break
			# Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
896
897         def _do_download(self, filename, info_dict):
898                 url = info_dict['url']
899                 player_url = info_dict.get('player_url', None)
900
901                 # Check file already present
902                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
903                         self.report_file_already_downloaded(filename)
904                         return True
905
906                 # Attempt to download using rtmpdump
907                 if url.startswith('rtmp'):
908                         return self._download_with_rtmpdump(filename, url, player_url)
909
910                 tmpfilename = self.temp_name(filename)
911                 stream = None
912
913                 # Do not include the Accept-Encoding header
914                 headers = {'Youtubedl-no-compression': 'True'}
915                 basic_request = urllib2.Request(url, None, headers)
916                 request = urllib2.Request(url, None, headers)
917
918                 # Establish possible resume length
919                 if os.path.isfile(tmpfilename):
920                         resume_len = os.path.getsize(tmpfilename)
921                 else:
922                         resume_len = 0
923
924                 open_mode = 'wb'
925                 if resume_len != 0:
926                         if self.params.get('continuedl', False):
927                                 self.report_resuming_byte(resume_len)
928                                 request.add_header('Range','bytes=%d-' % resume_len)
929                                 open_mode = 'ab'
930                         else:
931                                 resume_len = 0
932
933                 count = 0
934                 retries = self.params.get('retries', 0)
935                 while count <= retries:
936                         # Establish connection
937                         try:
938                                 if count == 0 and 'urlhandle' in info_dict:
939                                         data = info_dict['urlhandle']
940                                 data = urllib2.urlopen(request)
941                                 break
942                         except (urllib2.HTTPError, ), err:
943                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
944                                         # Unexpected HTTP error
945                                         raise
946                                 elif err.code == 416:
947                                         # Unable to resume (requested range not satisfiable)
948                                         try:
949                                                 # Open the connection again without the range header
950                                                 data = urllib2.urlopen(basic_request)
951                                                 content_length = data.info()['Content-Length']
952                                         except (urllib2.HTTPError, ), err:
953                                                 if err.code < 500 or err.code >= 600:
954                                                         raise
955                                         else:
956                                                 # Examine the reported length
957                                                 if (content_length is not None and
958                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
959                                                         # The file had already been fully downloaded.
960                                                         # Explanation to the above condition: in issue #175 it was revealed that
961                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
962                                                         # changing the file size slightly and causing problems for some users. So
963                                                         # I decided to implement a suggested change and consider the file
964                                                         # completely downloaded if the file size differs less than 100 bytes from
965                                                         # the one in the hard drive.
966                                                         self.report_file_already_downloaded(filename)
967                                                         self.try_rename(tmpfilename, filename)
968                                                         return True
969                                                 else:
970                                                         # The length does not match, we start the download over
971                                                         self.report_unable_to_resume()
972                                                         open_mode = 'wb'
973                                                         break
974                         # Retry
975                         count += 1
976                         if count <= retries:
977                                 self.report_retry(count, retries)
978
979                 if count > retries:
980                         self.trouble(u'ERROR: giving up after %s retries' % retries)
981                         return False
982
983                 data_len = data.info().get('Content-length', None)
984                 if data_len is not None:
985                         data_len = long(data_len) + resume_len
986                 data_len_str = self.format_bytes(data_len)
987                 byte_counter = 0 + resume_len
988                 block_size = 1024
989                 start = time.time()
990                 while True:
991                         # Download and write
992                         before = time.time()
993                         data_block = data.read(block_size)
994                         after = time.time()
995                         if len(data_block) == 0:
996                                 break
997                         byte_counter += len(data_block)
998
999                         # Open file just in time
1000                         if stream is None:
1001                                 try:
1002                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1003                                         assert stream is not None
1004                                         filename = self.undo_temp_name(tmpfilename)
1005                                         self.report_destination(filename)
1006                                 except (OSError, IOError), err:
1007                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1008                                         return False
1009                         try:
1010                                 stream.write(data_block)
1011                         except (IOError, OSError), err:
1012                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1013                                 return False
1014                         block_size = self.best_block_size(after - before, len(data_block))
1015
1016                         # Progress message
1017                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1018                         if data_len is None:
1019                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1020                         else:
1021                                 percent_str = self.calc_percent(byte_counter, data_len)
1022                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1023                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1024
1025                         # Apply rate limit
1026                         self.slow_down(start, byte_counter - resume_len)
1027
1028                 if stream is None:
1029                         self.trouble(u'\nERROR: Did not get any data blocks')
1030                         return False
1031                 stream.close()
1032                 self.report_finish()
1033                 if data_len is not None and byte_counter != data_len:
1034                         raise ContentTooShortError(byte_counter, long(data_len))
1035                 self.try_rename(tmpfilename, filename)
1036
1037                 # Update file modification time
1038                 if self.params.get('updatetime', True):
1039                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1040
1041                 return True
1042
1043
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces the metadata the
	FileDownloader needs in order to fetch the video (or videos) behind
	it: the real video URL, the title, and so on.  The information is
	returned as a dictionary, which the FileDownloader then processes
	(possibly downloading the video, among other outcomes).  Each
	dictionary must carry the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, consumed only by the forced-printing functions
	(their primary purpose is letting youtube-dl back a video search
	front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp.  New extractors should also be added
	to the list of extractors.
	"""

	# Whether one-time initialization (authentication, etc) has run.
	_ready = False
	# The FileDownloader this extractor reports to; see set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return URL information."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1112
1113
1114 class YoutubeIE(InfoExtractor):
1115         """Information extractor for youtube.com."""
1116
	# Accepts youtu.be short links, youtube.com / youtube-nocookie.com
	# watch/embed/v/e URLs and bare video ids; the negative lookahead
	# excludes playlist/artist pages.  Group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Fetched once to pin the site language to English, so the scraping
	# regexps below match regardless of the user's locale.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same format ids, reordered to prefer the WebM variants earlier.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Format id -> output file extension (ids absent here default to 'flv').
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Format id -> dimension string shown by --list-formats ('???' when unknown).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1152
1153         def report_lang(self):
1154                 """Report attempt to set language."""
1155                 self._downloader.to_screen(u'[youtube] Setting language')
1156
1157         def report_login(self):
1158                 """Report attempt to log in."""
1159                 self._downloader.to_screen(u'[youtube] Logging in')
1160
1161         def report_age_confirmation(self):
1162                 """Report attempt to confirm age."""
1163                 self._downloader.to_screen(u'[youtube] Confirming age')
1164
1165         def report_video_webpage_download(self, video_id):
1166                 """Report attempt to download video webpage."""
1167                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1168
1169         def report_video_info_webpage_download(self, video_id):
1170                 """Report attempt to download video info webpage."""
1171                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1172
1173         def report_information_extraction(self, video_id):
1174                 """Report attempt to extract video information."""
1175                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1176
1177         def report_unavailable_format(self, video_id, format):
1178                 """Report extracted video URL."""
1179                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1180
1181         def report_rtmp_download(self):
1182                 """Indicate the download will use the RTMP protocol."""
1183                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1184
1185         def _print_formats(self, formats):
1186                 print 'Available formats:'
1187                 for x in formats:
1188                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1189
	def _real_initialize(self):
		"""One-time setup: force English pages, optionally log in, confirm age.

		Does nothing without a downloader.  Login/netrc failures only emit
		warnings (the extractor then works anonymously); a failed age
		confirmation is reported as an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# Credentials could not be read; warn and abort initialization.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language to English so the scraping regexps match,
		# regardless of the user's locale.
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age, so age-restricted videos can be retrieved.
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1258
	def _real_extract(self, url):
		"""Extract metadata for a YouTube watch URL and hand each selected
		format to the downloader via process_info.

		url -- a URL matching _VALID_URL; the video id is capture group 2.
		Errors are reported through self._downloader.trouble/to_stderr and
		cause an early return; nothing is returned on success either.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (used later for rtmpdump);
		# absence is not fatal, so fall back to None.
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Undo the JavaScript backslash-escaping in the matched URL.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' parameter values in turn until
		# one response contains a 'token' (works around embed/age gating).
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scrape the watch page, normalize separators to
		# spaces, then try several human-readable date formats.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# Once one format has matched, the reformatted
					# value fails the remaining formats and the
					# result survives unchanged.
					pass

		# description: prefer lxml when the module-level import succeeded;
		# otherwise fall back to a crude <meta> regex, and only when a
		# description was actually requested.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: a single URL, no itag-based format selection.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> URL map from the comma-separated stream map.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			# format_list is ordered best-first, so existing_formats is too.
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1432
1433
1434 class MetacafeIE(InfoExtractor):
1435         """Information Extractor for metacafe.com."""
1436
1437         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1438         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1439         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1440         _youtube_ie = None
1441         IE_NAME = u'metacafe'
1442
1443         def __init__(self, youtube_ie, downloader=None):
1444                 InfoExtractor.__init__(self, downloader)
1445                 self._youtube_ie = youtube_ie
1446
1447         def report_disclaimer(self):
1448                 """Report disclaimer retrieval."""
1449                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1450
1451         def report_age_confirmation(self):
1452                 """Report attempt to confirm age."""
1453                 self._downloader.to_screen(u'[metacafe] Confirming age')
1454
1455         def report_download_webpage(self, video_id):
1456                 """Report webpage download."""
1457                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1458
1459         def report_extraction(self, video_id):
1460                 """Report information extraction."""
1461                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1462
1463         def _real_initialize(self):
1464                 # Retrieve disclaimer
1465                 request = urllib2.Request(self._DISCLAIMER)
1466                 try:
1467                         self.report_disclaimer()
1468                         disclaimer = urllib2.urlopen(request).read()
1469                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1471                         return
1472
1473                 # Confirm age
1474                 disclaimer_form = {
1475                         'filters': '0',
1476                         'submit': "Continue - I'm over 18",
1477                         }
1478                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1479                 try:
1480                         self.report_age_confirmation()
1481                         disclaimer = urllib2.urlopen(request).read()
1482                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1483                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1484                         return
1485
1486         def _real_extract(self, url):
1487                 # Extract id and simplified title from URL
1488                 mobj = re.match(self._VALID_URL, url)
1489                 if mobj is None:
1490                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1491                         return
1492
1493                 video_id = mobj.group(1)
1494
1495                 # Check if video comes from YouTube
1496                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1497                 if mobj2 is not None:
1498                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1499                         return
1500
1501                 # At this point we have a new video
1502                 self._downloader.increment_downloads()
1503
1504                 simple_title = mobj.group(2).decode('utf-8')
1505
1506                 # Retrieve video webpage to extract further information
1507                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1508                 try:
1509                         self.report_download_webpage(video_id)
1510                         webpage = urllib2.urlopen(request).read()
1511                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1512                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1513                         return
1514
1515                 # Extract URL, uploader and title from webpage
1516                 self.report_extraction(video_id)
1517                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1518                 if mobj is not None:
1519                         mediaURL = urllib.unquote(mobj.group(1))
1520                         video_extension = mediaURL[-3:]
1521
1522                         # Extract gdaKey if available
1523                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1524                         if mobj is None:
1525                                 video_url = mediaURL
1526                         else:
1527                                 gdaKey = mobj.group(1)
1528                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1529                 else:
1530                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1531                         if mobj is None:
1532                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1533                                 return
1534                         vardict = parse_qs(mobj.group(1))
1535                         if 'mediaData' not in vardict:
1536                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1537                                 return
1538                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1539                         if mobj is None:
1540                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1541                                 return
1542                         mediaURL = mobj.group(1).replace('\\/', '/')
1543                         video_extension = mediaURL[-3:]
1544                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1545
1546                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1547                 if mobj is None:
1548                         self._downloader.trouble(u'ERROR: unable to extract title')
1549                         return
1550                 video_title = mobj.group(1).decode('utf-8')
1551                 video_title = sanitize_title(video_title)
1552
1553                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1554                 if mobj is None:
1555                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1556                         return
1557                 video_uploader = mobj.group(1)
1558
1559                 try:
1560                         # Process video information
1561                         self._downloader.process_info({
1562                                 'id':           video_id.decode('utf-8'),
1563                                 'url':          video_url.decode('utf-8'),
1564                                 'uploader':     video_uploader.decode('utf-8'),
1565                                 'upload_date':  u'NA',
1566                                 'title':        video_title,
1567                                 'stitle':       simple_title,
1568                                 'ext':          video_extension.decode('utf-8'),
1569                                 'format':       u'NA',
1570                                 'player_url':   None,
1571                         })
1572                 except UnavailableVideoError:
1573                         self._downloader.trouble(u'\nERROR: unable to download video')
1574
1575
1576 class DailymotionIE(InfoExtractor):
1577         """Information Extractor for Dailymotion"""
1578
1579         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1580         IE_NAME = u'dailymotion'
1581
1582         def __init__(self, downloader=None):
1583                 InfoExtractor.__init__(self, downloader)
1584
1585         def report_download_webpage(self, video_id):
1586                 """Report webpage download."""
1587                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1588
1589         def report_extraction(self, video_id):
1590                 """Report information extraction."""
1591                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1592
1593         def _real_extract(self, url):
1594                 # Extract id and simplified title from URL
1595                 mobj = re.match(self._VALID_URL, url)
1596                 if mobj is None:
1597                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1598                         return
1599
1600                 # At this point we have a new video
1601                 self._downloader.increment_downloads()
1602                 video_id = mobj.group(1)
1603
1604                 simple_title = mobj.group(2).decode('utf-8')
1605                 video_extension = 'flv'
1606
1607                 # Retrieve video webpage to extract further information
1608                 request = urllib2.Request(url)
1609                 request.add_header('Cookie', 'family_filter=off')
1610                 try:
1611                         self.report_download_webpage(video_id)
1612                         webpage = urllib2.urlopen(request).read()
1613                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1614                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1615                         return
1616
1617                 # Extract URL, uploader and title from webpage
1618                 self.report_extraction(video_id)
1619                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1620                 if mobj is None:
1621                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1622                         return
1623                 sequence = urllib.unquote(mobj.group(1))
1624                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1625                 if mobj is None:
1626                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1627                         return
1628                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1629
1630                 # if needed add http://www.dailymotion.com/ if relative URL
1631
1632                 video_url = mediaURL
1633
1634                 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1635                 if mobj is None:
1636                         self._downloader.trouble(u'ERROR: unable to extract title')
1637                         return
1638                 video_title = mobj.group(1).decode('utf-8')
1639                 video_title = sanitize_title(video_title)
1640
1641                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1642                 if mobj is None:
1643                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1644                         return
1645                 video_uploader = mobj.group(1)
1646
1647                 try:
1648                         # Process video information
1649                         self._downloader.process_info({
1650                                 'id':           video_id.decode('utf-8'),
1651                                 'url':          video_url.decode('utf-8'),
1652                                 'uploader':     video_uploader.decode('utf-8'),
1653                                 'upload_date':  u'NA',
1654                                 'title':        video_title,
1655                                 'stitle':       simple_title,
1656                                 'ext':          video_extension.decode('utf-8'),
1657                                 'format':       u'NA',
1658                                 'player_url':   None,
1659                         })
1660                 except UnavailableVideoError:
1661                         self._downloader.trouble(u'\nERROR: unable to download video')
1662
1663
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com.

	Downloads the videoplay page and extracts the media URL (preferring
	the mp4 download_url, falling back to the escaped flv videoUrl),
	the title and the description.
	"""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract media URL, title, description and (optionally)
		thumbnail for a Google Video URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No mp4 download link; fall back to the escaped flv URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo JavaScript hex escapes: \x3d -> '=', \x26 -> '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# A second request is needed: the thumbnail only appears on
			# the search results page for this video id.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1766
1767
1768 class PhotobucketIE(InfoExtractor):
1769         """Information extractor for photobucket.com."""
1770
1771         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1772         IE_NAME = u'photobucket'
1773
1774         def __init__(self, downloader=None):
1775                 InfoExtractor.__init__(self, downloader)
1776
1777         def report_download_webpage(self, video_id):
1778                 """Report webpage download."""
1779                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1780
1781         def report_extraction(self, video_id):
1782                 """Report information extraction."""
1783                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1784
1785         def _real_extract(self, url):
1786                 # Extract id from URL
1787                 mobj = re.match(self._VALID_URL, url)
1788                 if mobj is None:
1789                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1790                         return
1791
1792                 # At this point we have a new video
1793                 self._downloader.increment_downloads()
1794                 video_id = mobj.group(1)
1795
1796                 video_extension = 'flv'
1797
1798                 # Retrieve video webpage to extract further information
1799                 request = urllib2.Request(url)
1800                 try:
1801                         self.report_download_webpage(video_id)
1802                         webpage = urllib2.urlopen(request).read()
1803                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1804                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1805                         return
1806
1807                 # Extract URL, uploader, and title from webpage
1808                 self.report_extraction(video_id)
1809                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1812                         return
1813                 mediaURL = urllib.unquote(mobj.group(1))
1814
1815                 video_url = mediaURL
1816
1817                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1818                 if mobj is None:
1819                         self._downloader.trouble(u'ERROR: unable to extract title')
1820                         return
1821                 video_title = mobj.group(1).decode('utf-8')
1822                 video_title = sanitize_title(video_title)
1823                 simple_title = _simplify_title(vide_title)
1824
1825                 video_uploader = mobj.group(2).decode('utf-8')
1826
1827                 try:
1828                         # Process video information
1829                         self._downloader.process_info({
1830                                 'id':           video_id.decode('utf-8'),
1831                                 'url':          video_url.decode('utf-8'),
1832                                 'uploader':     video_uploader,
1833                                 'upload_date':  u'NA',
1834                                 'title':        video_title,
1835                                 'stitle':       simple_title,
1836                                 'ext':          video_extension.decode('utf-8'),
1837                                 'format':       u'NA',
1838                                 'player_url':   None,
1839                         })
1840                 except UnavailableVideoError:
1841                         self._downloader.trouble(u'\nERROR: unable to download video')
1842
1843
1844 class YahooIE(InfoExtractor):
1845         """Information extractor for video.yahoo.com."""
1846
1847         # _VALID_URL matches all Yahoo! Video URLs
1848         # _VPAGE_URL matches only the extractable '/watch/' URLs
1849         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1850         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1851         IE_NAME = u'video.yahoo'
1852
1853         def __init__(self, downloader=None):
1854                 InfoExtractor.__init__(self, downloader)
1855
1856         def report_download_webpage(self, video_id):
1857                 """Report webpage download."""
1858                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1859
1860         def report_extraction(self, video_id):
1861                 """Report information extraction."""
1862                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1863
1864         def _real_extract(self, url, new_video=True):
1865                 # Extract ID from URL
1866                 mobj = re.match(self._VALID_URL, url)
1867                 if mobj is None:
1868                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1869                         return
1870
1871                 # At this point we have a new video
1872                 self._downloader.increment_downloads()
1873                 video_id = mobj.group(2)
1874                 video_extension = 'flv'
1875
1876                 # Rewrite valid but non-extractable URLs as
1877                 # extractable English language /watch/ URLs
1878                 if re.match(self._VPAGE_URL, url) is None:
1879                         request = urllib2.Request(url)
1880                         try:
1881                                 webpage = urllib2.urlopen(request).read()
1882                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1884                                 return
1885
1886                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1887                         if mobj is None:
1888                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1889                                 return
1890                         yahoo_id = mobj.group(1)
1891
1892                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1893                         if mobj is None:
1894                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1895                                 return
1896                         yahoo_vid = mobj.group(1)
1897
1898                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1899                         return self._real_extract(url, new_video=False)
1900
1901                 # Retrieve video webpage to extract further information
1902                 request = urllib2.Request(url)
1903                 try:
1904                         self.report_download_webpage(video_id)
1905                         webpage = urllib2.urlopen(request).read()
1906                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1908                         return
1909
1910                 # Extract uploader and title from webpage
1911                 self.report_extraction(video_id)
1912                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1913                 if mobj is None:
1914                         self._downloader.trouble(u'ERROR: unable to extract video title')
1915                         return
1916                 video_title = mobj.group(1).decode('utf-8')
1917                 simple_title = _simplify_title(video_title)
1918
1919                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1922                         return
1923                 video_uploader = mobj.group(1).decode('utf-8')
1924
1925                 # Extract video thumbnail
1926                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1927                 if mobj is None:
1928                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1929                         return
1930                 video_thumbnail = mobj.group(1).decode('utf-8')
1931
1932                 # Extract video description
1933                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1934                 if mobj is None:
1935                         self._downloader.trouble(u'ERROR: unable to extract video description')
1936                         return
1937                 video_description = mobj.group(1).decode('utf-8')
1938                 if not video_description:
1939                         video_description = 'No description available.'
1940
1941                 # Extract video height and width
1942                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1943                 if mobj is None:
1944                         self._downloader.trouble(u'ERROR: unable to extract video height')
1945                         return
1946                 yv_video_height = mobj.group(1)
1947
1948                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1949                 if mobj is None:
1950                         self._downloader.trouble(u'ERROR: unable to extract video width')
1951                         return
1952                 yv_video_width = mobj.group(1)
1953
1954                 # Retrieve video playlist to extract media URL
1955                 # I'm not completely sure what all these options are, but we
1956                 # seem to need most of them, otherwise the server sends a 401.
1957                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1958                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1959                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1960                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1961                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1962                 try:
1963                         self.report_download_webpage(video_id)
1964                         webpage = urllib2.urlopen(request).read()
1965                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1966                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1967                         return
1968
1969                 # Extract media URL from playlist XML
1970                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1971                 if mobj is None:
1972                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1973                         return
1974                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1975                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1976
1977                 try:
1978                         # Process video information
1979                         self._downloader.process_info({
1980                                 'id':           video_id.decode('utf-8'),
1981                                 'url':          video_url,
1982                                 'uploader':     video_uploader,
1983                                 'upload_date':  u'NA',
1984                                 'title':        video_title,
1985                                 'stitle':       simple_title,
1986                                 'ext':          video_extension.decode('utf-8'),
1987                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1988                                 'description':  video_description,
1989                                 'thumbnail':    video_thumbnail,
1990                                 'player_url':   None,
1991                         })
1992                 except UnavailableVideoError:
1993                         self._downloader.trouble(u'\nERROR: unable to download video')
1994
1995
1996 class VimeoIE(InfoExtractor):
1997         """Information extractor for vimeo.com."""
1998
1999         # _VALID_URL matches Vimeo URLs
2000         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2001         IE_NAME = u'vimeo'
2002
2003         def __init__(self, downloader=None):
2004                 InfoExtractor.__init__(self, downloader)
2005
2006         def report_download_webpage(self, video_id):
2007                 """Report webpage download."""
2008                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2009
2010         def report_extraction(self, video_id):
2011                 """Report information extraction."""
2012                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2013
2014         def _real_extract(self, url, new_video=True):
2015                 # Extract ID from URL
2016                 mobj = re.match(self._VALID_URL, url)
2017                 if mobj is None:
2018                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2019                         return
2020
2021                 # At this point we have a new video
2022                 self._downloader.increment_downloads()
2023                 video_id = mobj.group(1)
2024
2025                 # Retrieve video webpage to extract further information
2026                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2027                 try:
2028                         self.report_download_webpage(video_id)
2029                         webpage = urllib2.urlopen(request).read()
2030                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2031                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2032                         return
2033
2034                 # Now we begin extracting as much information as we can from what we
2035                 # retrieved. First we extract the information common to all extractors,
2036                 # and latter we extract those that are Vimeo specific.
2037                 self.report_extraction(video_id)
2038
2039                 # Extract title
2040                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2041                 if mobj is None:
2042                         self._downloader.trouble(u'ERROR: unable to extract video title')
2043                         return
2044                 video_title = mobj.group(1).decode('utf-8')
2045                 simple_title = _simplify_title(video_title)
2046
2047                 # Extract uploader
2048                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2049                 if mobj is None:
2050                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2051                         return
2052                 video_uploader = mobj.group(1).decode('utf-8')
2053
2054                 # Extract video thumbnail
2055                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2056                 if mobj is None:
2057                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2058                         return
2059                 video_thumbnail = mobj.group(1).decode('utf-8')
2060
2061                 # # Extract video description
2062                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2063                 # if mobj is None:
2064                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2065                 #       return
2066                 # video_description = mobj.group(1).decode('utf-8')
2067                 # if not video_description: video_description = 'No description available.'
2068                 video_description = 'Foo.'
2069
2070                 # Vimeo specific: extract request signature
2071                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2072                 if mobj is None:
2073                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2074                         return
2075                 sig = mobj.group(1).decode('utf-8')
2076
2077                 # Vimeo specific: extract video quality information
2078                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2079                 if mobj is None:
2080                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2081                         return
2082                 quality = mobj.group(1).decode('utf-8')
2083
2084                 if int(quality) == 1:
2085                         quality = 'hd'
2086                 else:
2087                         quality = 'sd'
2088
2089                 # Vimeo specific: Extract request signature expiration
2090                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2091                 if mobj is None:
2092                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2093                         return
2094                 sig_exp = mobj.group(1).decode('utf-8')
2095
2096                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2097
2098                 try:
2099                         # Process video information
2100                         self._downloader.process_info({
2101                                 'id':           video_id.decode('utf-8'),
2102                                 'url':          video_url,
2103                                 'uploader':     video_uploader,
2104                                 'upload_date':  u'NA',
2105                                 'title':        video_title,
2106                                 'stitle':       simple_title,
2107                                 'ext':          u'mp4',
2108                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2109                                 'description':  video_description,
2110                                 'thumbnail':    video_thumbnail,
2111                                 'description':  video_description,
2112                                 'player_url':   None,
2113                         })
2114                 except UnavailableVideoError:
2115                         self._downloader.trouble(u'ERROR: unable to download video')
2116
2117
2118 class GenericIE(InfoExtractor):
2119         """Generic last-resort information extractor."""
2120
2121         _VALID_URL = r'.*'
2122         IE_NAME = u'generic'
2123
2124         def __init__(self, downloader=None):
2125                 InfoExtractor.__init__(self, downloader)
2126
2127         def report_download_webpage(self, video_id):
2128                 """Report webpage download."""
2129                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2130                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2131
2132         def report_extraction(self, video_id):
2133                 """Report information extraction."""
2134                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2135
2136         def _real_extract(self, url):
2137                 # At this point we have a new video
2138                 self._downloader.increment_downloads()
2139
2140                 video_id = url.split('/')[-1]
2141                 request = urllib2.Request(url)
2142                 try:
2143                         self.report_download_webpage(video_id)
2144                         webpage = urllib2.urlopen(request).read()
2145                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2146                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2147                         return
2148                 except ValueError, err:
2149                         # since this is the last-resort InfoExtractor, if
2150                         # this error is thrown, it'll be thrown here
2151                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2152                         return
2153
2154                 self.report_extraction(video_id)
2155                 # Start with something easy: JW Player in SWFObject
2156                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2157                 if mobj is None:
2158                         # Broaden the search a little bit
2159                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2160                 if mobj is None:
2161                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2162                         return
2163
2164                 # It's possible that one of the regexes
2165                 # matched, but returned an empty group:
2166                 if mobj.group(1) is None:
2167                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2168                         return
2169
2170                 video_url = urllib.unquote(mobj.group(1))
2171                 video_id = os.path.basename(video_url)
2172
2173                 # here's a fun little line of code for you:
2174                 video_extension = os.path.splitext(video_id)[1][1:]
2175                 video_id = os.path.splitext(video_id)[0]
2176
2177                 # it's tempting to parse this further, but you would
2178                 # have to take into account all the variations like
2179                 #   Video Title - Site Name
2180                 #   Site Name | Video Title
2181                 #   Video Title - Tagline | Site Name
2182                 # and so on and so forth; it's just not practical
2183                 mobj = re.search(r'<title>(.*)</title>', webpage)
2184                 if mobj is None:
2185                         self._downloader.trouble(u'ERROR: unable to extract title')
2186                         return
2187                 video_title = mobj.group(1).decode('utf-8')
2188                 video_title = sanitize_title(video_title)
2189                 simple_title = _simplify_title(video_title)
2190
2191                 # video uploader is domain name
2192                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2193                 if mobj is None:
2194                         self._downloader.trouble(u'ERROR: unable to extract title')
2195                         return
2196                 video_uploader = mobj.group(1).decode('utf-8')
2197
2198                 try:
2199                         # Process video information
2200                         self._downloader.process_info({
2201                                 'id':           video_id.decode('utf-8'),
2202                                 'url':          video_url.decode('utf-8'),
2203                                 'uploader':     video_uploader,
2204                                 'upload_date':  u'NA',
2205                                 'title':        video_title,
2206                                 'stitle':       simple_title,
2207                                 'ext':          video_extension.decode('utf-8'),
2208                                 'format':       u'NA',
2209                                 'player_url':   None,
2210                         })
2211                 except UnavailableVideoError, err:
2212                         self._downloader.trouble(u'\nERROR: unable to download video')
2213
2214
2215 class YoutubeSearchIE(InfoExtractor):
2216         """Information Extractor for YouTube search queries."""
2217         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2218         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2219         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2220         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2221         _youtube_ie = None
2222         _max_youtube_results = 1000
2223         IE_NAME = u'youtube:search'
2224
2225         def __init__(self, youtube_ie, downloader=None):
2226                 InfoExtractor.__init__(self, downloader)
2227                 self._youtube_ie = youtube_ie
2228
2229         def report_download_page(self, query, pagenum):
2230                 """Report attempt to download playlist page with given number."""
2231                 query = query.decode(preferredencoding())
2232                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2233
2234         def _real_initialize(self):
2235                 self._youtube_ie.initialize()
2236
2237         def _real_extract(self, query):
2238                 mobj = re.match(self._VALID_URL, query)
2239                 if mobj is None:
2240                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2241                         return
2242
2243                 prefix, query = query.split(':')
2244                 prefix = prefix[8:]
2245                 query = query.encode('utf-8')
2246                 if prefix == '':
2247                         self._download_n_results(query, 1)
2248                         return
2249                 elif prefix == 'all':
2250                         self._download_n_results(query, self._max_youtube_results)
2251                         return
2252                 else:
2253                         try:
2254                                 n = long(prefix)
2255                                 if n <= 0:
2256                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2257                                         return
2258                                 elif n > self._max_youtube_results:
2259                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2260                                         n = self._max_youtube_results
2261                                 self._download_n_results(query, n)
2262                                 return
2263                         except ValueError: # parsing prefix as integer fails
2264                                 self._download_n_results(query, 1)
2265                                 return
2266
2267         def _download_n_results(self, query, n):
2268                 """Downloads a specified number of results for a query"""
2269
2270                 video_ids = []
2271                 already_seen = set()
2272                 pagenum = 1
2273
2274                 while True:
2275                         self.report_download_page(query, pagenum)
2276                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2277                         request = urllib2.Request(result_url)
2278                         try:
2279                                 page = urllib2.urlopen(request).read()
2280                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2281                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2282                                 return
2283
2284                         # Extract video identifiers
2285                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2286                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2287                                 if video_id not in already_seen:
2288                                         video_ids.append(video_id)
2289                                         already_seen.add(video_id)
2290                                         if len(video_ids) == n:
2291                                                 # Specified n videos reached
2292                                                 for id in video_ids:
2293                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2294                                                 return
2295
2296                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2297                                 for id in video_ids:
2298                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2299                                 return
2300
2301                         pagenum = pagenum + 1
2302
2303
2304 class GoogleSearchIE(InfoExtractor):
2305         """Information Extractor for Google Video search queries."""
2306         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2307         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2308         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2309         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2310         _google_ie = None
2311         _max_google_results = 1000
2312         IE_NAME = u'video.google:search'
2313
2314         def __init__(self, google_ie, downloader=None):
2315                 InfoExtractor.__init__(self, downloader)
2316                 self._google_ie = google_ie
2317
2318         def report_download_page(self, query, pagenum):
2319                 """Report attempt to download playlist page with given number."""
2320                 query = query.decode(preferredencoding())
2321                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2322
2323         def _real_initialize(self):
2324                 self._google_ie.initialize()
2325
2326         def _real_extract(self, query):
2327                 mobj = re.match(self._VALID_URL, query)
2328                 if mobj is None:
2329                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2330                         return
2331
2332                 prefix, query = query.split(':')
2333                 prefix = prefix[8:]
2334                 query = query.encode('utf-8')
2335                 if prefix == '':
2336                         self._download_n_results(query, 1)
2337                         return
2338                 elif prefix == 'all':
2339                         self._download_n_results(query, self._max_google_results)
2340                         return
2341                 else:
2342                         try:
2343                                 n = long(prefix)
2344                                 if n <= 0:
2345                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2346                                         return
2347                                 elif n > self._max_google_results:
2348                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2349                                         n = self._max_google_results
2350                                 self._download_n_results(query, n)
2351                                 return
2352                         except ValueError: # parsing prefix as integer fails
2353                                 self._download_n_results(query, 1)
2354                                 return
2355
2356         def _download_n_results(self, query, n):
2357                 """Downloads a specified number of results for a query"""
2358
2359                 video_ids = []
2360                 already_seen = set()
2361                 pagenum = 1
2362
2363                 while True:
2364                         self.report_download_page(query, pagenum)
2365                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2366                         request = urllib2.Request(result_url)
2367                         try:
2368                                 page = urllib2.urlopen(request).read()
2369                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2370                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2371                                 return
2372
2373                         # Extract video identifiers
2374                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2375                                 video_id = mobj.group(1)
2376                                 if video_id not in already_seen:
2377                                         video_ids.append(video_id)
2378                                         already_seen.add(video_id)
2379                                         if len(video_ids) == n:
2380                                                 # Specified n videos reached
2381                                                 for id in video_ids:
2382                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2383                                                 return
2384
2385                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2386                                 for id in video_ids:
2387                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2388                                 return
2389
2390                         pagenum = pagenum + 1
2391
2392
2393 class YahooSearchIE(InfoExtractor):
2394         """Information Extractor for Yahoo! Video search queries."""
2395         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2396         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2397         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2398         _MORE_PAGES_INDICATOR = r'\s*Next'
2399         _yahoo_ie = None
2400         _max_yahoo_results = 1000
2401         IE_NAME = u'video.yahoo:search'
2402
2403         def __init__(self, yahoo_ie, downloader=None):
2404                 InfoExtractor.__init__(self, downloader)
2405                 self._yahoo_ie = yahoo_ie
2406
2407         def report_download_page(self, query, pagenum):
2408                 """Report attempt to download playlist page with given number."""
2409                 query = query.decode(preferredencoding())
2410                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2411
2412         def _real_initialize(self):
2413                 self._yahoo_ie.initialize()
2414
2415         def _real_extract(self, query):
2416                 mobj = re.match(self._VALID_URL, query)
2417                 if mobj is None:
2418                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2419                         return
2420
2421                 prefix, query = query.split(':')
2422                 prefix = prefix[8:]
2423                 query = query.encode('utf-8')
2424                 if prefix == '':
2425                         self._download_n_results(query, 1)
2426                         return
2427                 elif prefix == 'all':
2428                         self._download_n_results(query, self._max_yahoo_results)
2429                         return
2430                 else:
2431                         try:
2432                                 n = long(prefix)
2433                                 if n <= 0:
2434                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2435                                         return
2436                                 elif n > self._max_yahoo_results:
2437                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2438                                         n = self._max_yahoo_results
2439                                 self._download_n_results(query, n)
2440                                 return
2441                         except ValueError: # parsing prefix as integer fails
2442                                 self._download_n_results(query, 1)
2443                                 return
2444
2445         def _download_n_results(self, query, n):
2446                 """Downloads a specified number of results for a query"""
2447
2448                 video_ids = []
2449                 already_seen = set()
2450                 pagenum = 1
2451
2452                 while True:
2453                         self.report_download_page(query, pagenum)
2454                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2455                         request = urllib2.Request(result_url)
2456                         try:
2457                                 page = urllib2.urlopen(request).read()
2458                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2460                                 return
2461
2462                         # Extract video identifiers
2463                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2464                                 video_id = mobj.group(1)
2465                                 if video_id not in already_seen:
2466                                         video_ids.append(video_id)
2467                                         already_seen.add(video_id)
2468                                         if len(video_ids) == n:
2469                                                 # Specified n videos reached
2470                                                 for id in video_ids:
2471                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2472                                                 return
2473
2474                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2475                                 for id in video_ids:
2476                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2477                                 return
2478
2479                         pagenum = pagenum + 1
2480
2481
2482 class YoutubePlaylistIE(InfoExtractor):
2483         """Information Extractor for YouTube playlists."""
2484
2485         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2486         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2487         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2488         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2489         _youtube_ie = None
2490         IE_NAME = u'youtube:playlist'
2491
2492         def __init__(self, youtube_ie, downloader=None):
2493                 InfoExtractor.__init__(self, downloader)
2494                 self._youtube_ie = youtube_ie
2495
2496         def report_download_page(self, playlist_id, pagenum):
2497                 """Report attempt to download playlist page with given number."""
2498                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2499
2500         def _real_initialize(self):
2501                 self._youtube_ie.initialize()
2502
2503         def _real_extract(self, url):
2504                 # Extract playlist id
2505                 mobj = re.match(self._VALID_URL, url)
2506                 if mobj is None:
2507                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2508                         return
2509
2510                 # Single video case
2511                 if mobj.group(3) is not None:
2512                         self._youtube_ie.extract(mobj.group(3))
2513                         return
2514
2515                 # Download playlist pages
2516                 # prefix is 'p' as default for playlists but there are other types that need extra care
2517                 playlist_prefix = mobj.group(1)
2518                 if playlist_prefix == 'a':
2519                         playlist_access = 'artist'
2520                 else:
2521                         playlist_prefix = 'p'
2522                         playlist_access = 'view_play_list'
2523                 playlist_id = mobj.group(2)
2524                 video_ids = []
2525                 pagenum = 1
2526
2527                 while True:
2528                         self.report_download_page(playlist_id, pagenum)
2529                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2530                         request = urllib2.Request(url)
2531                         try:
2532                                 page = urllib2.urlopen(request).read()
2533                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2534                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2535                                 return
2536
2537                         # Extract video identifiers
2538                         ids_in_page = []
2539                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2540                                 if mobj.group(1) not in ids_in_page:
2541                                         ids_in_page.append(mobj.group(1))
2542                         video_ids.extend(ids_in_page)
2543
2544                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2545                                 break
2546                         pagenum = pagenum + 1
2547
2548                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2549                 playlistend = self._downloader.params.get('playlistend', -1)
2550                 video_ids = video_ids[playliststart:playlistend]
2551
2552                 for id in video_ids:
2553                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2554                 return
2555
2556
2557 class YoutubeUserIE(InfoExtractor):
2558         """Information Extractor for YouTube users."""
2559
2560         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2561         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2562         _GDATA_PAGE_SIZE = 50
2563         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2564         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2565         _youtube_ie = None
2566         IE_NAME = u'youtube:user'
2567
2568         def __init__(self, youtube_ie, downloader=None):
2569                 InfoExtractor.__init__(self, downloader)
2570                 self._youtube_ie = youtube_ie
2571
2572         def report_download_page(self, username, start_index):
2573                 """Report attempt to download user page."""
2574                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2575                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2576
2577         def _real_initialize(self):
2578                 self._youtube_ie.initialize()
2579
2580         def _real_extract(self, url):
2581                 # Extract username
2582                 mobj = re.match(self._VALID_URL, url)
2583                 if mobj is None:
2584                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2585                         return
2586
2587                 username = mobj.group(1)
2588
2589                 # Download video ids using YouTube Data API. Result size per
2590                 # query is limited (currently to 50 videos) so we need to query
2591                 # page by page until there are no video ids - it means we got
2592                 # all of them.
2593
2594                 video_ids = []
2595                 pagenum = 0
2596
2597                 while True:
2598                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2599                         self.report_download_page(username, start_index)
2600
2601                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2602
2603                         try:
2604                                 page = urllib2.urlopen(request).read()
2605                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2606                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2607                                 return
2608
2609                         # Extract video identifiers
2610                         ids_in_page = []
2611
2612                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2613                                 if mobj.group(1) not in ids_in_page:
2614                                         ids_in_page.append(mobj.group(1))
2615
2616                         video_ids.extend(ids_in_page)
2617
2618                         # A little optimization - if current page is not
2619                         # "full", ie. does not contain PAGE_SIZE video ids then
2620                         # we can assume that this page is the last one - there
2621                         # are no more ids on further pages - no need to query
2622                         # again.
2623
2624                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2625                                 break
2626
2627                         pagenum += 1
2628
2629                 all_ids_count = len(video_ids)
2630                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2631                 playlistend = self._downloader.params.get('playlistend', -1)
2632
2633                 if playlistend == -1:
2634                         video_ids = video_ids[playliststart:]
2635                 else:
2636                         video_ids = video_ids[playliststart:playlistend]
2637
2638                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2639                                 (username, all_ids_count, len(video_ids)))
2640
2641                 for video_id in video_ids:
2642                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2643
2644
2645 class DepositFilesIE(InfoExtractor):
2646         """Information extractor for depositfiles.com"""
2647
2648         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2649         IE_NAME = u'DepositFiles'
2650
2651         def __init__(self, downloader=None):
2652                 InfoExtractor.__init__(self, downloader)
2653
2654         def report_download_webpage(self, file_id):
2655                 """Report webpage download."""
2656                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2657
2658         def report_extraction(self, file_id):
2659                 """Report information extraction."""
2660                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2661
2662         def _real_extract(self, url):
2663                 # At this point we have a new file
2664                 self._downloader.increment_downloads()
2665
2666                 file_id = url.split('/')[-1]
2667                 # Rebuild url in english locale
2668                 url = 'http://depositfiles.com/en/files/' + file_id
2669
2670                 # Retrieve file webpage with 'Free download' button pressed
2671                 free_download_indication = { 'gateway_result' : '1' }
2672                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2673                 try:
2674                         self.report_download_webpage(file_id)
2675                         webpage = urllib2.urlopen(request).read()
2676                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2677                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2678                         return
2679
2680                 # Search for the real file URL
2681                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2682                 if (mobj is None) or (mobj.group(1) is None):
2683                         # Try to figure out reason of the error.
2684                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2685                         if (mobj is not None) and (mobj.group(1) is not None):
2686                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2687                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2688                         else:
2689                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2690                         return
2691
2692                 file_url = mobj.group(1)
2693                 file_extension = os.path.splitext(file_url)[1][1:]
2694
2695                 # Search for file title
2696                 mobj = re.search(r'<b title="(.*?)">', webpage)
2697                 if mobj is None:
2698                         self._downloader.trouble(u'ERROR: unable to extract title')
2699                         return
2700                 file_title = mobj.group(1).decode('utf-8')
2701
2702                 try:
2703                         # Process file information
2704                         self._downloader.process_info({
2705                                 'id':           file_id.decode('utf-8'),
2706                                 'url':          file_url.decode('utf-8'),
2707                                 'uploader':     u'NA',
2708                                 'upload_date':  u'NA',
2709                                 'title':        file_title,
2710                                 'stitle':       file_title,
2711                                 'ext':          file_extension.decode('utf-8'),
2712                                 'format':       u'NA',
2713                                 'player_url':   None,
2714                         })
2715                 except UnavailableVideoError, err:
2716                         self._downloader.trouble(u'ERROR: unable to download file')
2717
2718
2719 class FacebookIE(InfoExtractor):
2720         """Information Extractor for Facebook"""
2721
2722         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2723         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2724         _NETRC_MACHINE = 'facebook'
2725         _available_formats = ['video', 'highqual', 'lowqual']
2726         _video_extensions = {
2727                 'video': 'mp4',
2728                 'highqual': 'mp4',
2729                 'lowqual': 'mp4',
2730         }
2731         IE_NAME = u'facebook'
2732
2733         def __init__(self, downloader=None):
2734                 InfoExtractor.__init__(self, downloader)
2735
2736         def _reporter(self, message):
2737                 """Add header and report message."""
2738                 self._downloader.to_screen(u'[facebook] %s' % message)
2739
2740         def report_login(self):
2741                 """Report attempt to log in."""
2742                 self._reporter(u'Logging in')
2743
2744         def report_video_webpage_download(self, video_id):
2745                 """Report attempt to download video webpage."""
2746                 self._reporter(u'%s: Downloading video webpage' % video_id)
2747
2748         def report_information_extraction(self, video_id):
2749                 """Report attempt to extract video information."""
2750                 self._reporter(u'%s: Extracting video information' % video_id)
2751
2752         def _parse_page(self, video_webpage):
2753                 """Extract video information from page"""
2754                 # General data
2755                 data = {'title': r'\("video_title", "(.*?)"\)',
2756                         'description': r'<div class="datawrap">(.*?)</div>',
2757                         'owner': r'\("video_owner_name", "(.*?)"\)',
2758                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2759                         }
2760                 video_info = {}
2761                 for piece in data.keys():
2762                         mobj = re.search(data[piece], video_webpage)
2763                         if mobj is not None:
2764                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2765
2766                 # Video urls
2767                 video_urls = {}
2768                 for fmt in self._available_formats:
2769                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2770                         if mobj is not None:
2771                                 # URL is in a Javascript segment inside an escaped Unicode format within
2772                                 # the generally utf-8 page
2773                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2774                 video_info['video_urls'] = video_urls
2775
2776                 return video_info
2777
2778         def _real_initialize(self):
2779                 if self._downloader is None:
2780                         return
2781
2782                 useremail = None
2783                 password = None
2784                 downloader_params = self._downloader.params
2785
2786                 # Attempt to use provided username and password or .netrc data
2787                 if downloader_params.get('username', None) is not None:
2788                         useremail = downloader_params['username']
2789                         password = downloader_params['password']
2790                 elif downloader_params.get('usenetrc', False):
2791                         try:
2792                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2793                                 if info is not None:
2794                                         useremail = info[0]
2795                                         password = info[2]
2796                                 else:
2797                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2798                         except (IOError, netrc.NetrcParseError), err:
2799                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2800                                 return
2801
2802                 if useremail is None:
2803                         return
2804
2805                 # Log in
2806                 login_form = {
2807                         'email': useremail,
2808                         'pass': password,
2809                         'login': 'Log+In'
2810                         }
2811                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2812                 try:
2813                         self.report_login()
2814                         login_results = urllib2.urlopen(request).read()
2815                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2816                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2817                                 return
2818                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2819                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2820                         return
2821
2822         def _real_extract(self, url):
2823                 mobj = re.match(self._VALID_URL, url)
2824                 if mobj is None:
2825                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2826                         return
2827                 video_id = mobj.group('ID')
2828
2829                 # Get video webpage
2830                 self.report_video_webpage_download(video_id)
2831                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2832                 try:
2833                         page = urllib2.urlopen(request)
2834                         video_webpage = page.read()
2835                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2836                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2837                         return
2838
2839                 # Start extracting information
2840                 self.report_information_extraction(video_id)
2841
2842                 # Extract information
2843                 video_info = self._parse_page(video_webpage)
2844
2845                 # uploader
2846                 if 'owner' not in video_info:
2847                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2848                         return
2849                 video_uploader = video_info['owner']
2850
2851                 # title
2852                 if 'title' not in video_info:
2853                         self._downloader.trouble(u'ERROR: unable to extract video title')
2854                         return
2855                 video_title = video_info['title']
2856                 video_title = video_title.decode('utf-8')
2857                 video_title = sanitize_title(video_title)
2858
2859                 simple_title = _simplify_title(video_title)
2860
2861                 # thumbnail image
2862                 if 'thumbnail' not in video_info:
2863                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2864                         video_thumbnail = ''
2865                 else:
2866                         video_thumbnail = video_info['thumbnail']
2867
2868                 # upload date
2869                 upload_date = u'NA'
2870                 if 'upload_date' in video_info:
2871                         upload_time = video_info['upload_date']
2872                         timetuple = email.utils.parsedate_tz(upload_time)
2873                         if timetuple is not None:
2874                                 try:
2875                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2876                                 except:
2877                                         pass
2878
2879                 # description
2880                 video_description = video_info.get('description', 'No description available.')
2881
2882                 url_map = video_info['video_urls']
2883                 if len(url_map.keys()) > 0:
2884                         # Decide which formats to download
2885                         req_format = self._downloader.params.get('format', None)
2886                         format_limit = self._downloader.params.get('format_limit', None)
2887
2888                         if format_limit is not None and format_limit in self._available_formats:
2889                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2890                         else:
2891                                 format_list = self._available_formats
2892                         existing_formats = [x for x in format_list if x in url_map]
2893                         if len(existing_formats) == 0:
2894                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2895                                 return
2896                         if req_format is None:
2897                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2898                         elif req_format == 'worst':
2899                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2900                         elif req_format == '-1':
2901                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2902                         else:
2903                                 # Specific format
2904                                 if req_format not in url_map:
2905                                         self._downloader.trouble(u'ERROR: requested format not available')
2906                                         return
2907                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2908
2909                 for format_param, video_real_url in video_url_list:
2910
2911                         # At this point we have a new video
2912                         self._downloader.increment_downloads()
2913
2914                         # Extension
2915                         video_extension = self._video_extensions.get(format_param, 'mp4')
2916
2917                         try:
2918                                 # Process video information
2919                                 self._downloader.process_info({
2920                                         'id':           video_id.decode('utf-8'),
2921                                         'url':          video_real_url.decode('utf-8'),
2922                                         'uploader':     video_uploader.decode('utf-8'),
2923                                         'upload_date':  upload_date,
2924                                         'title':        video_title,
2925                                         'stitle':       simple_title,
2926                                         'ext':          video_extension.decode('utf-8'),
2927                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2928                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2929                                         'description':  video_description.decode('utf-8'),
2930                                         'player_url':   None,
2931                                 })
2932                         except UnavailableVideoError, err:
2933                                 self._downloader.trouble(u'\nERROR: unable to download video')
2934
2935 class BlipTVIE(InfoExtractor):
2936         """Information extractor for blip.tv"""
2937
2938         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2939         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2940         IE_NAME = u'blip.tv'
2941
2942         def report_extraction(self, file_id):
2943                 """Report information extraction."""
2944                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2945
2946         def report_direct_download(self, title):
2947                 """Report information extraction."""
2948                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2949
	def _real_extract(self, url):
		"""Extract video information from the given URL.

		Requests the page with the JSON skin parameters appended; if the
		server answers with a video/* Content-Type the URL is treated as a
		direct media download, otherwise the JSON payload is parsed for
		the media metadata.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin query with '?' or '&' depending on whether
		# the URL already carries a query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# Derive id/title/extension from the last path component.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh  # already-open response handle, passed through to the downloader
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			# urlh is still open from the request above; read the JSON body.
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The payload is either wrapped as {'Post': {...}} or bare.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# NOTE(review): '%H:%M%p' mixes a 24-hour field with AM/PM;
				# PM timestamps may parse wrong ('%I' looks intended) — confirm feed format.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3028
3029 class MyVideoIE(InfoExtractor):
3030         """Information Extractor for myvideo.de."""
3031
3032         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3033         IE_NAME = u'myvideo'
3034
3035         def __init__(self, downloader=None):
3036                 InfoExtractor.__init__(self, downloader)
3037         
3038         def report_download_webpage(self, video_id):
3039                 """Report webpage download."""
3040                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3041
3042         def report_extraction(self, video_id):
3043                 """Report information extraction."""
3044                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3045
3046         def _real_extract(self,url):
3047                 mobj = re.match(self._VALID_URL, url)
3048                 if mobj is None:
3049                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3050                         return
3051
3052                 video_id = mobj.group(1)
3053
3054                 # Get video webpage
3055                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3056                 try:
3057                         self.report_download_webpage(video_id)
3058                         webpage = urllib2.urlopen(request).read()
3059                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3060                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3061                         return
3062
3063                 self.report_extraction(video_id)
3064                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3065                                  webpage)
3066                 if mobj is None:
3067                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3068                         return
3069                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3070
3071                 mobj = re.search('<title>([^<]+)</title>', webpage)
3072                 if mobj is None:
3073                         self._downloader.trouble(u'ERROR: unable to extract title')
3074                         return
3075
3076                 video_title = mobj.group(1)
3077                 video_title = sanitize_title(video_title)
3078
3079                 simple_title = _simplify_title(video_title)
3080
3081                 try:
3082                         self._downloader.process_info({
3083                                 'id':           video_id,
3084                                 'url':          video_url,
3085                                 'uploader':     u'NA',
3086                                 'upload_date':  u'NA',
3087                                 'title':        video_title,
3088                                 'stitle':       simple_title,
3089                                 'ext':          u'flv',
3090                                 'format':       u'NA',
3091                                 'player_url':   None,
3092                         })
3093                 except UnavailableVideoError:
3094                         self._downloader.trouble(u'\nERROR: Unable to download video')
3095
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname form (":tds", ":colbertreport", ...) or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's media index feed."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Extract and download every media item of an episode.

		Shortnames are rewritten to the show's full-episodes page; in that
		case the HTTP redirect of the page identifies the newest episode.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No specific episode requested: rely on the site redirecting the
		# full-episodes page to the newest episode.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-match against the redirected, episode-specific URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The flash <param name="movie"> tag carries both the full player URL
		# (group 0) and the mtvnservices media URI (group 1).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Follow redirects to get the final player URL.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# Each <item> of the index is a separate media file; download all.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs from the renditions list.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): this keeps feed order and compares nothing —
			# it assumes the last rendition is the best; confirm against feed.
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3230
3231
3232 class EscapistIE(InfoExtractor):
3233         """Information extractor for The Escapist """
3234
3235         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3236         IE_NAME = u'escapist'
3237
3238         def report_extraction(self, showName):
3239                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3240
3241         def report_config_download(self, showName):
3242                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3243
3244         def _real_extract(self, url):
3245                 htmlParser = HTMLParser.HTMLParser()
3246
3247                 mobj = re.match(self._VALID_URL, url)
3248                 if mobj is None:
3249                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3250                         return
3251                 showName = mobj.group('showname')
3252                 videoId = mobj.group('episode')
3253
3254                 self.report_extraction(showName)
3255                 try:
3256                         webPage = urllib2.urlopen(url).read()
3257                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3258                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3259                         return
3260
3261                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3262                 description = htmlParser.unescape(descMatch.group(1))
3263                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3264                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3265                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3266                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3267                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3268                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3269
3270                 self.report_config_download(showName)
3271                 try:
3272                         configJSON = urllib2.urlopen(configUrl).read()
3273                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3274                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3275                         return
3276
3277                 # Technically, it's JavaScript, not JSON
3278                 configJSON = configJSON.replace("'", '"')
3279
3280                 try:
3281                         config = json.loads(configJSON)
3282                 except (ValueError,), err:
3283                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3284                         return
3285
3286                 playlist = config['playlist']
3287                 videoUrl = playlist[1]['url']
3288
3289                 self._downloader.increment_downloads()
3290                 info = {
3291                         'id': videoId,
3292                         'url': videoUrl,
3293                         'uploader': showName,
3294                         'upload_date': None,
3295                         'title': showName,
3296                         'stitle': _simplify_title(showName),
3297                         'ext': 'flv',
3298                         'format': 'flv',
3299                         'thumbnail': imgUrl,
3300                         'description': description,
3301                         'player_url': playerUrl,
3302                 }
3303
3304                 try:
3305                         self._downloader.process_info(info)
3306                 except UnavailableVideoError, err:
3307                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3308
3309
3310 class CollegeHumorIE(InfoExtractor):
3311         """Information extractor for collegehumor.com"""
3312
3313         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3314         IE_NAME = u'collegehumor'
3315
3316         def report_webpage(self, video_id):
3317                 """Report information extraction."""
3318                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3319
3320         def report_extraction(self, video_id):
3321                 """Report information extraction."""
3322                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3323
3324         def _real_extract(self, url):
3325                 htmlParser = HTMLParser.HTMLParser()
3326
3327                 mobj = re.match(self._VALID_URL, url)
3328                 if mobj is None:
3329                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3330                         return
3331                 video_id = mobj.group('videoid')
3332
3333                 self.report_webpage(video_id)
3334                 request = urllib2.Request(url)
3335                 try:
3336                         webpage = urllib2.urlopen(request).read()
3337                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3338                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3339                         return
3340
3341                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3342                 if m is None:
3343                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3344                         return
3345                 internal_video_id = m.group('internalvideoid')
3346
3347                 info = {
3348                         'id': video_id,
3349                         'internal_id': internal_video_id,
3350                 }
3351
3352                 self.report_extraction(video_id)
3353                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3354                 try:
3355                         metaXml = urllib2.urlopen(xmlUrl).read()
3356                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3357                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3358                         return
3359
3360                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3361                 try:
3362                         videoNode = mdoc.findall('./video')[0]
3363                         info['description'] = videoNode.findall('./description')[0].text
3364                         info['title'] = videoNode.findall('./caption')[0].text
3365                         info['stitle'] = _simplify_title(info['title'])
3366                         info['url'] = videoNode.findall('./file')[0].text
3367                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3368                         info['ext'] = info['url'].rpartition('.')[2]
3369                         info['format'] = info['ext']
3370                 except IndexError:
3371                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3372                         return
3373
3374                 self._downloader.increment_downloads()
3375
3376                 try:
3377                         self._downloader.process_info(info)
3378                 except UnavailableVideoError, err:
3379                         self._downloader.trouble(u'\nERROR: unable to download video')
3380
3381
3382 class XVideosIE(InfoExtractor):
3383         """Information extractor for xvideos.com"""
3384
3385         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3386         IE_NAME = u'xvideos'
3387
3388         def report_webpage(self, video_id):
3389                 """Report information extraction."""
3390                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3391
3392         def report_extraction(self, video_id):
3393                 """Report information extraction."""
3394                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3395
3396         def _real_extract(self, url):
3397                 htmlParser = HTMLParser.HTMLParser()
3398
3399                 mobj = re.match(self._VALID_URL, url)
3400                 if mobj is None:
3401                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3402                         return
3403                 video_id = mobj.group(1).decode('utf-8')
3404
3405                 self.report_webpage(video_id)
3406
3407                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3408                 try:
3409                         webpage = urllib2.urlopen(request).read()
3410                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3411                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3412                         return
3413
3414                 self.report_extraction(video_id)
3415
3416
3417                 # Extract video URL
3418                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3419                 if mobj is None:
3420                         self._downloader.trouble(u'ERROR: unable to extract video url')
3421                         return
3422                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3423
3424
3425                 # Extract title
3426                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3427                 if mobj is None:
3428                         self._downloader.trouble(u'ERROR: unable to extract video title')
3429                         return
3430                 video_title = mobj.group(1).decode('utf-8')
3431
3432
3433                 # Extract video thumbnail
3434                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3435                 if mobj is None:
3436                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3437                         return
3438                 video_thumbnail = mobj.group(1).decode('utf-8')
3439
3440
3441
3442                 self._downloader.increment_downloads()
3443                 info = {
3444                         'id': video_id,
3445                         'url': video_url,
3446                         'uploader': None,
3447                         'upload_date': None,
3448                         'title': video_title,
3449                         'stitle': _simplify_title(video_title),
3450                         'ext': 'flv',
3451                         'format': 'flv',
3452                         'thumbnail': video_thumbnail,
3453                         'description': None,
3454                         'player_url': None,
3455                 }
3456
3457                 try:
3458                         self._downloader.process_info(info)
3459                 except UnavailableVideoError, err:
3460                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3461
3462
3463 class SoundcloudIE(InfoExtractor):
3464         """Information extractor for soundcloud.com
3465            To access the media, the uid of the song and a stream token
3466            must be extracted from the page source and the script must make
3467            a request to media.soundcloud.com/crossdomain.xml. Then
3468            the media can be grabbed by requesting from an url composed
3469            of the stream token and uid
3470          """
3471
3472         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3473         IE_NAME = u'soundcloud'
3474
3475         def __init__(self, downloader=None):
3476                 InfoExtractor.__init__(self, downloader)
3477
3478         def report_webpage(self, video_id):
3479                 """Report information extraction."""
3480                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3481
3482         def report_extraction(self, video_id):
3483                 """Report information extraction."""
3484                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3485
3486         def _real_extract(self, url):
3487                 htmlParser = HTMLParser.HTMLParser()
3488
3489                 mobj = re.match(self._VALID_URL, url)
3490                 if mobj is None:
3491                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3492                         return
3493
3494                 # extract uploader (which is in the url)
3495                 uploader = mobj.group(1).decode('utf-8')
3496                 # extract simple title (uploader + slug of song title)
3497                 slug_title =  mobj.group(2).decode('utf-8')
3498                 simple_title = uploader + '-' + slug_title
3499
3500                 self.report_webpage('%s/%s' % (uploader, slug_title))
3501
3502                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3503                 try:
3504                         webpage = urllib2.urlopen(request).read()
3505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3506                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3507                         return
3508
3509                 self.report_extraction('%s/%s' % (uploader, slug_title))
3510
3511                 # extract uid and stream token that soundcloud hands out for access
3512                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3513                 if mobj:
3514                         video_id = mobj.group(1)
3515                         stream_token = mobj.group(2)
3516
3517                 # extract unsimplified title
3518                 mobj = re.search('"title":"(.*?)",', webpage)
3519                 if mobj:
3520                         title = mobj.group(1)
3521
3522                 # construct media url (with uid/token)
3523                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3524                 mediaURL = mediaURL % (video_id, stream_token)
3525
3526                 # description
3527                 description = u'No description available'
3528                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3529                 if mobj:
3530                         description = mobj.group(1)
3531                 
3532                 # upload date
3533                 upload_date = None
3534                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3535                 if mobj:
3536                         try:
3537                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3538                         except Exception, e:
3539                                 print str(e)
3540
3541                 # for soundcloud, a request to a cross domain is required for cookies
3542                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3543
3544                 try:
3545                         self._downloader.process_info({
3546                                 'id':           video_id.decode('utf-8'),
3547                                 'url':          mediaURL,
3548                                 'uploader':     uploader.decode('utf-8'),
3549                                 'upload_date':  upload_date,
3550                                 'title':        simple_title.decode('utf-8'),
3551                                 'stitle':       simple_title.decode('utf-8'),
3552                                 'ext':          u'mp3',
3553                                 'format':       u'NA',
3554                                 'player_url':   None,
3555                                 'description': description.decode('utf-8')
3556                         })
3557                 except UnavailableVideoError:
3558                         self._downloader.trouble(u'\nERROR: unable to download video')
3559
3560
3561 class InfoQIE(InfoExtractor):
3562         """Information extractor for infoq.com"""
3563
3564         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3565         IE_NAME = u'infoq'
3566
	def report_webpage(self, video_id):
		"""Report that the page for *video_id* is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3570
	def report_extraction(self, video_id):
		"""Report that information extraction for *video_id* has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3574
3575         def _real_extract(self, url):
3576                 htmlParser = HTMLParser.HTMLParser()
3577
3578                 mobj = re.match(self._VALID_URL, url)
3579                 if mobj is None:
3580                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3581                         return
3582
3583                 self.report_webpage(url)
3584
3585                 request = urllib2.Request(url)
3586                 try:
3587                         webpage = urllib2.urlopen(request).read()
3588                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3589                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3590                         return
3591
3592                 self.report_extraction(url)
3593
3594
3595                 # Extract video URL
3596                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3597                 if mobj is None:
3598                         self._downloader.trouble(u'ERROR: unable to extract video url')
3599                         return
3600                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3601
3602
3603                 # Extract title
3604                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3605                 if mobj is None:
3606                         self._downloader.trouble(u'ERROR: unable to extract video title')
3607                         return
3608                 video_title = mobj.group(1).decode('utf-8')
3609
3610                 # Extract description
3611                 video_description = u'No description available.'
3612                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3613                 if mobj is not None:
3614                         video_description = mobj.group(1).decode('utf-8')
3615
3616                 video_filename = video_url.split('/')[-1]
3617                 video_id, extension = video_filename.split('.')
3618
3619                 self._downloader.increment_downloads()
3620                 info = {
3621                         'id': video_id,
3622                         'url': video_url,
3623                         'uploader': None,
3624                         'upload_date': None,
3625                         'title': video_title,
3626                         'stitle': _simplify_title(video_title),
3627                         'ext': extension,
3628                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3629                         'thumbnail': None,
3630                         'description': video_description,
3631                         'player_url': None,
3632                 }
3633
3634                 try:
3635                         self._downloader.process_info(info)
3636                 except UnavailableVideoError, err:
3637                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3638
3639 class MixcloudIE(InfoExtractor):
3640         """Information extractor for www.mixcloud.com"""
3641         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3642         IE_NAME = u'mixcloud'
3643
3644         def __init__(self, downloader=None):
3645                 InfoExtractor.__init__(self, downloader)
3646
3647         def report_download_json(self, file_id):
3648                 """Report JSON download."""
3649                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3650
3651         def report_extraction(self, file_id):
3652                 """Report information extraction."""
3653                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3654
3655         def get_urls(self, jsonData, fmt, bitrate='best'):
3656                 """Get urls from 'audio_formats' section in json"""
3657                 file_url = None
3658                 try:
3659                         bitrate_list = jsonData[fmt]
3660                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3661                                 bitrate = max(bitrate_list) # select highest
3662
3663                         url_list = jsonData[fmt][bitrate]
3664                 except TypeError: # we have no bitrate info.
3665                         url_list = jsonData[fmt]
3666                                 
3667                 return url_list
3668
3669         def check_urls(self, url_list):
3670                 """Returns 1st active url from list"""
3671                 for url in url_list:
3672                         try:
3673                                 urllib2.urlopen(url)
3674                                 return url
3675                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3676                                 url = None
3677
3678                 return None
3679
3680         def _print_formats(self, formats):
3681                 print 'Available formats:'
3682                 for fmt in formats.keys():
3683                         for b in formats[fmt]:
3684                                 try:
3685                                         ext = formats[fmt][b][0]
3686                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3687                                 except TypeError: # we have no bitrate info
3688                                         ext = formats[fmt][0]
3689                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3690                                         break
3691
3692         def _real_extract(self, url):
3693                 mobj = re.match(self._VALID_URL, url)
3694                 if mobj is None:
3695                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3696                         return
3697                 # extract uploader & filename from url
3698                 uploader = mobj.group(1).decode('utf-8')
3699                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3700
3701                 # construct API request
3702                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3703                 # retrieve .json file with links to files
3704                 request = urllib2.Request(file_url)
3705                 try:
3706                         self.report_download_json(file_url)
3707                         jsonData = urllib2.urlopen(request).read()
3708                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3709                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3710                         return
3711
3712                 # parse JSON
3713                 json_data = json.loads(jsonData)
3714                 player_url = json_data['player_swf_url']
3715                 formats = dict(json_data['audio_formats'])
3716
3717                 req_format = self._downloader.params.get('format', None)
3718                 bitrate = None
3719
3720                 if self._downloader.params.get('listformats', None):
3721                         self._print_formats(formats)
3722                         return
3723
3724                 if req_format is None or req_format == 'best':
3725                         for format_param in formats.keys():
3726                                 url_list = self.get_urls(formats, format_param)
3727                                 # check urls
3728                                 file_url = self.check_urls(url_list)
3729                                 if file_url is not None:
3730                                         break # got it!
3731                 else:
3732                         if req_format not in formats.keys():
3733                                 self._downloader.trouble(u'ERROR: format is not available')
3734                                 return
3735
3736                         url_list = self.get_urls(formats, req_format)
3737                         file_url = self.check_urls(url_list)
3738                         format_param = req_format
3739
3740                 # We have audio
3741                 self._downloader.increment_downloads()
3742                 try:
3743                         # Process file information
3744                         self._downloader.process_info({
3745                                 'id':           file_id.decode('utf-8'),
3746                                 'url':          file_url.decode('utf-8'),
3747                                 'uploader':     uploader.decode('utf-8'),
3748                                 'upload_date':  u'NA',
3749                                 'title':        json_data['name'],
3750                                 'stitle':       _simplify_title(json_data['name']),
3751                                 'ext':          file_url.split('.')[-1].decode('utf-8'),
3752                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3753                                 'thumbnail':    json_data['thumbnail_url'],
3754                                 'description':  json_data['description'],
3755                                 'player_url':   player_url.decode('utf-8'),
3756                         })
3757                 except UnavailableVideoError, err:
3758                         self._downloader.trouble(u'ERROR: unable to download file')
3759
3760 class StanfordOpenClassroomIE(InfoExtractor):
3761         """Information extractor for Stanford's Open ClassRoom"""
3762
3763         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3764         IE_NAME = u'stanfordoc'
3765
3766         def report_download_webpage(self, objid):
3767                 """Report information extraction."""
3768                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3769
3770         def report_extraction(self, video_id):
3771                 """Report information extraction."""
3772                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3773
3774         def _real_extract(self, url):
3775                 mobj = re.match(self._VALID_URL, url)
3776                 if mobj is None:
3777                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3778                         return
3779
3780                 if mobj.group('course') and mobj.group('video'): # A specific video
3781                         course = mobj.group('course')
3782                         video = mobj.group('video')
3783                         info = {
3784                                 'id': _simplify_title(course + '_' + video),
3785                         }
3786         
3787                         self.report_extraction(info['id'])
3788                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3789                         xmlUrl = baseUrl + video + '.xml'
3790                         try:
3791                                 metaXml = urllib2.urlopen(xmlUrl).read()
3792                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3793                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3794                                 return
3795                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3796                         try:
3797                                 info['title'] = mdoc.findall('./title')[0].text
3798                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3799                         except IndexError:
3800                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3801                                 return
3802                         info['stitle'] = _simplify_title(info['title'])
3803                         info['ext'] = info['url'].rpartition('.')[2]
3804                         info['format'] = info['ext']
3805                         self._downloader.increment_downloads()
3806                         try:
3807                                 self._downloader.process_info(info)
3808                         except UnavailableVideoError, err:
3809                                 self._downloader.trouble(u'\nERROR: unable to download video')
3810                 elif mobj.group('course'): # A course page
3811                         unescapeHTML = HTMLParser.HTMLParser().unescape
3812
3813                         course = mobj.group('course')
3814                         info = {
3815                                 'id': _simplify_title(course),
3816                                 'type': 'playlist',
3817                         }
3818
3819                         self.report_download_webpage(info['id'])
3820                         try:
3821                                 coursepage = urllib2.urlopen(url).read()
3822                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3823                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3824                                 return
3825
3826                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3827                         if m:
3828                                 info['title'] = unescapeHTML(m.group(1))
3829                         else:
3830                                 info['title'] = info['id']
3831                         info['stitle'] = _simplify_title(info['title'])
3832
3833                         m = re.search('<description>([^<]+)</description>', coursepage)
3834                         if m:
3835                                 info['description'] = unescapeHTML(m.group(1))
3836
3837                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3838                         info['list'] = [
3839                                 {
3840                                         'type': 'reference',
3841                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3842                                 }
3843                                         for vpage in links]
3844
3845                         for entry in info['list']:
3846                                 assert entry['type'] == 'reference'
3847                                 self.extract(entry['url'])
3848                 else: # Root page
3849                         unescapeHTML = HTMLParser.HTMLParser().unescape
3850
3851                         info = {
3852                                 'id': 'Stanford OpenClassroom',
3853                                 'type': 'playlist',
3854                         }
3855
3856                         self.report_download_webpage(info['id'])
3857                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3858                         try:
3859                                 rootpage = urllib2.urlopen(rootURL).read()
3860                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3861                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3862                                 return
3863
3864                         info['title'] = info['id']
3865                         info['stitle'] = _simplify_title(info['title'])
3866
3867                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3868                         info['list'] = [
3869                                 {
3870                                         'type': 'reference',
3871                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3872                                 }
3873                                         for cpage in links]
3874
3875                         for entry in info['list']:
3876                                 assert entry['type'] == 'reference'
3877                                 self.extract(entry['url'])
3878
3879
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of post processors and invokes run() on
	each one: the first receives the initial information dictionary, and
	every later one receives whatever the previous run() returned.

	Processing stops as soon as a run() returns None, or once the end of
	the chain is reached.

	Post processors and downloaders register with each other mutually,
	just like InfoExtractor objects do.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process a finished download.

		"information" is a dictionary of the kind produced by
		InfoExtractors, extended with an extra "filepath" key naming the
		file that was downloaded.

		Returning None halts the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next post processor in the
		chain. This method may also raise a PostProcessingError, which
		the downloader that invoked it takes into account.

		The default implementation passes the information through
		unchanged.
		"""
		return information
3925
3926
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	copy the track losslessly or transcode it to the preferred codec.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec		# target codec name, or 'best'
		self._preferredquality = preferredquality	# bitrate passed to ffmpeg -ab
		self._keepvideo = keepvideo			# keep the original video file?

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None on failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints codec_name= before codec_type= within each stream
		# block, so remember the last codec seen and report it once we know
		# the stream is an audio stream.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract the audio; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath']; return updated info or None."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				# Best-effort only; narrowed from a bare except so that
				# KeyboardInterrupt/SystemExit are not swallowed here.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4033
4034
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	#
	# Exits the process with an error message if the file is not writable,
	# the download fails, or the new version cannot be written.
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	urlh = None
	try:
		try:
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()

			vmatch = re.search("__version__ = '([^']+)'", newcontent)
			if vmatch is not None and vmatch.group(1) == __version__:
				downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
				return
		finally:
			# Guard against urlopen() itself failing, in which case urlh
			# was never bound and closing it would raise NameError and
			# mask the original error.
			if urlh is not None:
				urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4067
4068 def parseOpts():
4069         # Deferred imports
4070         import getpass
4071         import optparse
4072         import shlex
4073
4074         def _readOptions(filename):
4075                 try:
4076                         optionf = open(filename)
4077                 except IOError:
4078                         return [] # silently skip if file is not present
4079                 try:
4080                         res = []
4081                         for l in optionf:
4082                                 res += shlex.split(l, comments=True)
4083                 finally:
4084                         optionf.close()
4085                 return res
4086
4087         def _format_option_string(option):
4088                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4089
4090                 opts = []
4091
4092                 if option._short_opts: opts.append(option._short_opts[0])
4093                 if option._long_opts: opts.append(option._long_opts[0])
4094                 if len(opts) > 1: opts.insert(1, ', ')
4095
4096                 if option.takes_value(): opts.append(' %s' % option.metavar)
4097
4098                 return "".join(opts)
4099
4100         def _find_term_columns():
4101                 columns = os.environ.get('COLUMNS', None)
4102                 if columns:
4103                         return int(columns)
4104
4105                 try:
4106                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4107                         out,err = sp.communicate()
4108                         return int(out.split()[1])
4109                 except:
4110                         pass
4111                 return None
4112
4113         max_width = 80
4114         max_help_position = 80
4115
4116         # No need to wrap help messages if we're on a wide console
4117         columns = _find_term_columns()
4118         if columns: max_width = columns
4119
4120         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4121         fmt.format_option_strings = _format_option_string
4122
4123         kw = {
4124                 'version'   : __version__,
4125                 'formatter' : fmt,
4126                 'usage' : '%prog [options] url [url...]',
4127                 'conflict_handler' : 'resolve',
4128         }
4129
4130         parser = optparse.OptionParser(**kw)
4131
4132         # option groups
4133         general        = optparse.OptionGroup(parser, 'General Options')
4134         selection      = optparse.OptionGroup(parser, 'Video Selection')
4135         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4136         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4137         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4138         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4139         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4140
4141         general.add_option('-h', '--help',
4142                         action='help', help='print this help text and exit')
4143         general.add_option('-v', '--version',
4144                         action='version', help='print program version and exit')
4145         general.add_option('-U', '--update',
4146                         action='store_true', dest='update_self', help='update this program to latest version')
4147         general.add_option('-i', '--ignore-errors',
4148                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4149         general.add_option('-r', '--rate-limit',
4150                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4151         general.add_option('-R', '--retries',
4152                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4153         general.add_option('--dump-user-agent',
4154                         action='store_true', dest='dump_user_agent',
4155                         help='display the current browser identification', default=False)
4156         general.add_option('--list-extractors',
4157                         action='store_true', dest='list_extractors',
4158                         help='List all supported extractors and the URLs they would handle', default=False)
4159
4160         selection.add_option('--playlist-start',
4161                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4162         selection.add_option('--playlist-end',
4163                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4164         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4165         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4166         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4167
4168         authentication.add_option('-u', '--username',
4169                         dest='username', metavar='USERNAME', help='account username')
4170         authentication.add_option('-p', '--password',
4171                         dest='password', metavar='PASSWORD', help='account password')
4172         authentication.add_option('-n', '--netrc',
4173                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4174
4175
4176         video_format.add_option('-f', '--format',
4177                         action='store', dest='format', metavar='FORMAT', help='video format code')
4178         video_format.add_option('--all-formats',
4179                         action='store_const', dest='format', help='download all available video formats', const='all')
4180         video_format.add_option('--prefer-free-formats',
4181                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4182         video_format.add_option('--max-quality',
4183                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4184         video_format.add_option('-F', '--list-formats',
4185                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4186
4187
4188         verbosity.add_option('-q', '--quiet',
4189                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4190         verbosity.add_option('-s', '--simulate',
4191                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4192         verbosity.add_option('--skip-download',
4193                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4194         verbosity.add_option('-g', '--get-url',
4195                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4196         verbosity.add_option('-e', '--get-title',
4197                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4198         verbosity.add_option('--get-thumbnail',
4199                         action='store_true', dest='getthumbnail',
4200                         help='simulate, quiet but print thumbnail URL', default=False)
4201         verbosity.add_option('--get-description',
4202                         action='store_true', dest='getdescription',
4203                         help='simulate, quiet but print video description', default=False)
4204         verbosity.add_option('--get-filename',
4205                         action='store_true', dest='getfilename',
4206                         help='simulate, quiet but print output filename', default=False)
4207         verbosity.add_option('--get-format',
4208                         action='store_true', dest='getformat',
4209                         help='simulate, quiet but print output format', default=False)
4210         verbosity.add_option('--no-progress',
4211                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4212         verbosity.add_option('--console-title',
4213                         action='store_true', dest='consoletitle',
4214                         help='display progress in console titlebar', default=False)
4215
4216
4217         filesystem.add_option('-t', '--title',
4218                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4219         filesystem.add_option('-l', '--literal',
4220                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4221         filesystem.add_option('-A', '--auto-number',
4222                         action='store_true', dest='autonumber',
4223                         help='number downloaded files starting from 00000', default=False)
4224         filesystem.add_option('-o', '--output',
4225                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4226         filesystem.add_option('-a', '--batch-file',
4227                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4228         filesystem.add_option('-w', '--no-overwrites',
4229                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4230         filesystem.add_option('-c', '--continue',
4231                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4232         filesystem.add_option('--no-continue',
4233                         action='store_false', dest='continue_dl',
4234                         help='do not resume partially downloaded files (restart from beginning)')
4235         filesystem.add_option('--cookies',
4236                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4237         filesystem.add_option('--no-part',
4238                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4239         filesystem.add_option('--no-mtime',
4240                         action='store_false', dest='updatetime',
4241                         help='do not use the Last-modified header to set the file modification time', default=True)
4242         filesystem.add_option('--write-description',
4243                         action='store_true', dest='writedescription',
4244                         help='write video description to a .description file', default=False)
4245         filesystem.add_option('--write-info-json',
4246                         action='store_true', dest='writeinfojson',
4247                         help='write video metadata to a .info.json file', default=False)
4248
4249
4250         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4251                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4252         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4253                         help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4254         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4255                         help='ffmpeg audio bitrate specification, 128k by default')
4256         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4257                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4258
4259
4260         parser.add_option_group(general)
4261         parser.add_option_group(selection)
4262         parser.add_option_group(filesystem)
4263         parser.add_option_group(verbosity)
4264         parser.add_option_group(video_format)
4265         parser.add_option_group(authentication)
4266         parser.add_option_group(postproc)
4267
4268         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4269         if xdg_config_home:
4270                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4271         else:
4272                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4273         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4274         opts, args = parser.parse_args(argv)
4275
4276         return parser, opts, args
4277
def gen_extractors():
	"""Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# These instances are shared: the playlist/user/search extractors
	# delegate actual video extraction to the plain extractor they wrap.
	yt_ie = YoutubeIE()
	goog_ie = GoogleIE()
	yahoo_video_ie = YahooIE()

	# Wrapper extractors first, so e.g. a playlist URL is not swallowed
	# by the single-video extractor.
	ies = [
		YoutubePlaylistIE(yt_ie),
		YoutubeUserIE(yt_ie),
		YoutubeSearchIE(yt_ie),
		yt_ie,
		MetacafeIE(yt_ie),
		DailymotionIE(),
		goog_ie,
		GoogleSearchIE(goog_ie),
		PhotobucketIE(),
		yahoo_video_ie,
		YahooSearchIE(yahoo_video_ie),
	]

	# Stand-alone extractors with no shared state.
	for ie_class in (DepositFilesIE, FacebookIE, BlipTVIE, VimeoIE,
			MyVideoIE, ComedyCentralIE, EscapistIE, CollegeHumorIE,
			XVideosIE, SoundcloudIE, InfoQIE, MixcloudIE,
			StanfordOpenClassroomIE):
		ies.append(ie_class())

	# The generic fallback must come last: it matches nearly any URL.
	ies.append(GenericIE())
	return ies
4313
4314 def _real_main():
4315         parser, opts, args = parseOpts()
4316
4317         # Open appropriate CookieJar
4318         if opts.cookiefile is None:
4319                 jar = cookielib.CookieJar()
4320         else:
4321                 try:
4322                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4323                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4324                                 jar.load()
4325                 except (IOError, OSError), err:
4326                         sys.exit(u'ERROR: unable to open cookie file')
4327
4328         # Dump user agent
4329         if opts.dump_user_agent:
4330                 print std_headers['User-Agent']
4331                 sys.exit(0)
4332
4333         # Batch file verification
4334         batchurls = []
4335         if opts.batchfile is not None:
4336                 try:
4337                         if opts.batchfile == '-':
4338                                 batchfd = sys.stdin
4339                         else:
4340                                 batchfd = open(opts.batchfile, 'r')
4341                         batchurls = batchfd.readlines()
4342                         batchurls = [x.strip() for x in batchurls]
4343                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4344                 except IOError:
4345                         sys.exit(u'ERROR: batch file could not be read')
4346         all_urls = batchurls + args
4347
4348         # General configuration
4349         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4350         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4351         urllib2.install_opener(opener)
4352         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4353
4354         extractors = gen_extractors()
4355
4356         if opts.list_extractors:
4357                 for ie in extractors:
4358                         print(ie.IE_NAME)
4359                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4360                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4361                         for mu in matchedUrls:
4362                                 print(u'  ' + mu)
4363                 sys.exit(0)
4364
4365         # Conflicting, missing and erroneous options
4366         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4367                 parser.error(u'using .netrc conflicts with giving username/password')
4368         if opts.password is not None and opts.username is None:
4369                 parser.error(u'account username missing')
4370         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4371                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4372         if opts.usetitle and opts.useliteral:
4373                 parser.error(u'using title conflicts with using literal title')
4374         if opts.username is not None and opts.password is None:
4375                 opts.password = getpass.getpass(u'Type account password and press return:')
4376         if opts.ratelimit is not None:
4377                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4378                 if numeric_limit is None:
4379                         parser.error(u'invalid rate limit specified')
4380                 opts.ratelimit = numeric_limit
4381         if opts.retries is not None:
4382                 try:
4383                         opts.retries = long(opts.retries)
4384                 except (TypeError, ValueError), err:
4385                         parser.error(u'invalid retry count specified')
4386         try:
4387                 opts.playliststart = int(opts.playliststart)
4388                 if opts.playliststart <= 0:
4389                         raise ValueError(u'Playlist start must be positive')
4390         except (TypeError, ValueError), err:
4391                 parser.error(u'invalid playlist start number specified')
4392         try:
4393                 opts.playlistend = int(opts.playlistend)
4394                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4395                         raise ValueError(u'Playlist end must be greater than playlist start')
4396         except (TypeError, ValueError), err:
4397                 parser.error(u'invalid playlist end number specified')
4398         if opts.extractaudio:
4399                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4400                         parser.error(u'invalid audio format specified')
4401
4402         # File downloader
4403         fd = FileDownloader({
4404                 'usenetrc': opts.usenetrc,
4405                 'username': opts.username,
4406                 'password': opts.password,
4407                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4408                 'forceurl': opts.geturl,
4409                 'forcetitle': opts.gettitle,
4410                 'forcethumbnail': opts.getthumbnail,
4411                 'forcedescription': opts.getdescription,
4412                 'forcefilename': opts.getfilename,
4413                 'forceformat': opts.getformat,
4414                 'simulate': opts.simulate,
4415                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4416                 'format': opts.format,
4417                 'format_limit': opts.format_limit,
4418                 'listformats': opts.listformats,
4419                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4420                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4421                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4422                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4423                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4424                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4425                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4426                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4427                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4428                         or u'%(id)s.%(ext)s'),
4429                 'ignoreerrors': opts.ignoreerrors,
4430                 'ratelimit': opts.ratelimit,
4431                 'nooverwrites': opts.nooverwrites,
4432                 'retries': opts.retries,
4433                 'continuedl': opts.continue_dl,
4434                 'noprogress': opts.noprogress,
4435                 'playliststart': opts.playliststart,
4436                 'playlistend': opts.playlistend,
4437                 'logtostderr': opts.outtmpl == '-',
4438                 'consoletitle': opts.consoletitle,
4439                 'nopart': opts.nopart,
4440                 'updatetime': opts.updatetime,
4441                 'writedescription': opts.writedescription,
4442                 'writeinfojson': opts.writeinfojson,
4443                 'matchtitle': opts.matchtitle,
4444                 'rejecttitle': opts.rejecttitle,
4445                 'max_downloads': opts.max_downloads,
4446                 'prefer_free_formats': opts.prefer_free_formats,
4447                 })
4448         for extractor in extractors:
4449                 fd.add_info_extractor(extractor)
4450
4451         # PostProcessors
4452         if opts.extractaudio:
4453                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4454
4455         # Update version
4456         if opts.update_self:
4457                 updateSelf(fd, sys.argv[0])
4458
4459         # Maybe do nothing
4460         if len(all_urls) < 1:
4461                 if not opts.update_self:
4462                         parser.error(u'you must provide at least one URL')
4463                 else:
4464                         sys.exit()
4465         
4466         try:
4467                 retcode = fd.download(all_urls)
4468         except MaxDownloadsReached:
4469                 fd.to_screen(u'--max-download limit reached, aborting.')
4470                 retcode = 101
4471
4472         # Dump cookie jar if requested
4473         if opts.cookiefile is not None:
4474                 try:
4475                         jar.save()
4476                 except (IOError, OSError), err:
4477                         sys.exit(u'ERROR: unable to save cookie jar')
4478
4479         sys.exit(retcode)
4480
def main():
	"""Entry point: run _real_main and translate the known top-level
	failures into exit statuses/messages."""
	try:
		_real_main()
	except DownloadError:
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	else:
		return
	sys.exit(status)
4490
# Script entry point; main() wraps _real_main with top-level error handling.
if __name__ == '__main__':
	main()
4493
4494 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: