# First tests
# [youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         )
18
19 __license__ = 'Public Domain'
20 __version__ = '2011.10.19'
21
22 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
23
24 import cookielib
25 import datetime
26 import gzip
27 import htmlentitydefs
28 import HTMLParser
29 import httplib
30 import locale
31 import math
32 import netrc
33 import os
34 import os.path
35 import re
36 import socket
37 import string
38 import subprocess
39 import sys
40 import time
41 import urllib
42 import urllib2
43 import warnings
44 import zlib
45
46 if os.name == 'nt':
47         import ctypes
48
49 try:
50         import email.utils
51 except ImportError: # Python 2.4
52         import email.Utils
53 try:
54         import cStringIO as StringIO
55 except ImportError:
56         import StringIO
57
58 # parse_qs was moved from the cgi module to the urlparse module recently.
59 try:
60         from urlparse import parse_qs
61 except ImportError:
62         from cgi import parse_qs
63
64 try:
65         import lxml.etree
66 except ImportError:
67         pass # Handled below
68
69 try:
70         import xml.etree.ElementTree
71 except ImportError: # Python<2.5: Not officially supported, but let it slip
72         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73
74 std_headers = {
75         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
76         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
77         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
78         'Accept-Encoding': 'gzip, deflate',
79         'Accept-Language': 'en-us,en;q=0.5',
80 }
81
82 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
83
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal decode-only stand-in for the stdlib json module."""
		@staticmethod
		def loads(s):
			# s is a UTF-8 encoded byte string holding one JSON document.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Report a parse error pointing at offset i of the input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance i past JSON whitespace; with expectMore, fail if
				# the input ends before a non-space character.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape (match.group(1), without
				# the leading backslash) into the character it denotes.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# UTF-16 surrogate pair \uD8xx\uDCxx combined into
						# a single code point above U+FFFF.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				# Parse the string literal whose opening quote is at i;
				# return (index past the closing quote, decoded text).
				i += 1
				e = i
				while True:
					e = s.index('"', e)
					# Count backslashes immediately preceding the quote:
					# an odd count means the quote itself is escaped.
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				# The pattern matches surrogate pairs first so they are
				# decoded as one unit by decodeEscape.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				# Parse the object whose '{' is at i; returns (next index, dict).
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				# Parse the array whose '[' is at i; returns (next index, list).
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# Parse one of the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				# Parse an int or float in JSON number syntax.
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; any character not
			# listed here is assumed to start a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
196
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Make sure the locale's answer can actually be used for encoding;
		# a broken locale setup would make the encode call fail.
		u'TEST'.encode(pref)
	except:
		pref = 'UTF-8'
	return pref
212
213
214 def htmlentity_transform(matchobj):
215         """Transforms an HTML entity to a Unicode character.
216
217         This function receives a match object and is intended to be used with
218         the re.sub() function.
219         """
220         entity = matchobj.group(1)
221
222         # Known non-numeric HTML entity
223         if entity in htmlentitydefs.name2codepoint:
224                 return unichr(htmlentitydefs.name2codepoint[entity])
225
226         # Unicode character
227         mobj = re.match(ur'(?u)#(x?\d+)', entity)
228         if mobj is not None:
229                 numstr = mobj.group(1)
230                 if numstr.startswith(u'x'):
231                         base = 16
232                         numstr = u'0%s' % numstr
233                 else:
234                         base = 10
235                 return unichr(long(numstr, base))
236
237         # Unknown entity in name, return its literal representation
238         return (u'&%s;' % entity)
239
240
241 def sanitize_title(utitle):
242         """Sanitizes a video title so it could be used as part of a filename."""
243         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
244         return utitle.replace(unicode(os.sep), u'%')
245
246
247 def sanitize_open(filename, open_mode):
248         """Try to open the given filename, and slightly tweak it if this fails.
249
250         Attempts to open the given filename. If this fails, it tries to change
251         the filename slightly, step by step, until it's either able to open it
252         or it fails and raises a final exception, like the standard open()
253         function.
254
255         It returns the tuple (stream, definitive_file_name).
256         """
257         try:
258                 if filename == u'-':
259                         if sys.platform == 'win32':
260                                 import msvcrt
261                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
262                         return (sys.stdout, filename)
263                 stream = open(filename, open_mode)
264                 return (stream, filename)
265         except (IOError, OSError), err:
266                 # In case of error, try to remove win32 forbidden chars
267                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268
269                 # An exception here should be caught in the caller
270                 stream = open(filename, open_mode)
271                 return (stream, filename)
272
273
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		# The string was not a valid RFC 2822 date.
		return None
	return email.utils.mktime_tz(parsed)
281
282 def _simplify_title(title):
283         return re.sub(ur'[^\w\d_\-]+', u'_', title)
284
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	# Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
	pass
293
294
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	# Carries no extra state; the condition itself is the message.
	pass
302
303
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	# Carries no extra state; the condition itself is the message.
	pass
311
312
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	# Carries no extra state; the condition itself is the message.
	pass
320
321
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None  # bytes actually received
	expected = None    # bytes the server announced

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
336
337
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send raw deflate data (no zlib header); try that
		# first, then fall back to a regular zlib-wrapped stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# addinfourl only accepts a code argument (and has getcode()) from
		# Python 2.6 on; emulate it on older versions by setting the
		# attribute after construction.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Delete-then-add so our standard headers replace any that urllib2
		# already put on the request.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# NOTE: the odd capitalization matches how urllib2 normalizes
		# header names ('Youtubedl-no-compression', not '...-No-...').
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, preserving the original
		# response's headers, URL, status code and message.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
395
396
397 class FileDownloader(object):
398         """File Downloader class.
399
400         File downloader objects are the ones responsible of downloading the
401         actual video file and writing it to disk if the user has requested
402         it, among some other tasks. In most cases there should be one per
403         program. As, given a video URL, the downloader doesn't know how to
404         extract all the needed information, task that InfoExtractors do, it
405         has to pass the URL to one of them.
406
407         For this, file downloader objects have a method that allows
408         InfoExtractors to be registered in a given order. When it is passed
409         a URL, the file downloader handles it to the first InfoExtractor it
410         finds that reports being able to handle it. The InfoExtractor extracts
411         all the information about the video or videos the URL refers to, and
412         asks the FileDownloader to process the video information, possibly
413         downloading the video.
414
415         File downloaders accept a lot of parameters. In order not to saturate
416         the object constructor with arguments, it receives a dictionary of
417         options instead. These options are available through the params
418         attribute for the InfoExtractors to use. The FileDownloader also
419         registers itself as the downloader in charge for the InfoExtractors
420         that are added to it, so this is a "mutual registration".
421
422         Available options:
423
424         username:         Username for authentication purposes.
425         password:         Password for authentication purposes.
426         usenetrc:         Use netrc for authentication instead.
427         quiet:            Do not print messages to stdout.
428         forceurl:         Force printing final URL.
429         forcetitle:       Force printing title.
430         forcethumbnail:   Force printing thumbnail URL.
431         forcedescription: Force printing description.
432         forcefilename:    Force printing final filename.
433         simulate:         Do not download the video files.
434         format:           Video format code.
435         format_limit:     Highest quality format to try.
436         outtmpl:          Template for output names.
437         ignoreerrors:     Do not stop on download errors.
438         ratelimit:        Download speed limit, in bytes/sec.
439         nooverwrites:     Prevent overwriting files.
440         retries:          Number of times to retry for HTTP error 5xx
441         continuedl:       Try to continue downloads if possible.
442         noprogress:       Do not print the progress bar.
443         playliststart:    Playlist item to start at.
444         playlistend:      Playlist item to end at.
445         matchtitle:       Download only matching titles.
446         rejecttitle:      Reject downloads for matching titles.
447         logtostderr:      Log messages to stderr instead of stdout.
448         consoletitle:     Display progress in console window's titlebar.
449         nopart:           Do not use temporary .part files.
450         updatetime:       Use the Last-modified header to set output file timestamps.
451         writedescription: Write the video description to a .description file
452         writeinfojson:    Write the video description to a .info.json file
453         """
454
	# Class-level defaults documenting the instance attributes; real
	# per-instance values are assigned in __init__.
	params = None             # options dictionary (see class docstring)
	_ies = []                 # registered InfoExtractors, in order
	_pps = []                 # registered PostProcessors, in order
	_download_retcode = None  # process exit code accumulated by trouble()
	_num_downloads = None     # ordinal counter used by 'autonumber'
	_screen_file = None       # stream used by to_screen()
461
462         def __init__(self, params):
463                 """Create a FileDownloader object with the given options."""
464                 self._ies = []
465                 self._pps = []
466                 self._download_retcode = 0
467                 self._num_downloads = 0
468                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
469                 self.params = params
470
471         @staticmethod
472         def format_bytes(bytes):
473                 if bytes is None:
474                         return 'N/A'
475                 if type(bytes) is str:
476                         bytes = float(bytes)
477                 if bytes == 0.0:
478                         exponent = 0
479                 else:
480                         exponent = long(math.log(bytes, 1024.0))
481                 suffix = 'bkMGTPEZY'[exponent]
482                 converted = float(bytes) / float(1024 ** exponent)
483                 return '%.2f%s' % (converted, suffix)
484
485         @staticmethod
486         def calc_percent(byte_counter, data_len):
487                 if data_len is None:
488                         return '---.-%'
489                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
490
491         @staticmethod
492         def calc_eta(start, now, total, current):
493                 if total is None:
494                         return '--:--'
495                 dif = now - start
496                 if current == 0 or dif < 0.001: # One millisecond
497                         return '--:--'
498                 rate = float(current) / dif
499                 eta = long((float(total) - float(current)) / rate)
500                 (eta_mins, eta_secs) = divmod(eta, 60)
501                 if eta_mins > 99:
502                         return '--:--'
503                 return '%02d:%02d' % (eta_mins, eta_secs)
504
505         @staticmethod
506         def calc_speed(start, now, bytes):
507                 dif = now - start
508                 if bytes == 0 or dif < 0.001: # One millisecond
509                         return '%10s' % '---b/s'
510                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
511
512         @staticmethod
513         def best_block_size(elapsed_time, bytes):
514                 new_min = max(bytes / 2.0, 1.0)
515                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516                 if elapsed_time < 0.001:
517                         return long(new_max)
518                 rate = bytes / elapsed_time
519                 if rate > new_max:
520                         return long(new_max)
521                 if rate < new_min:
522                         return long(new_min)
523                 return long(rate)
524
525         @staticmethod
526         def parse_bytes(bytestr):
527                 """Parse a string indicating a byte quantity into a long integer."""
528                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
529                 if matchobj is None:
530                         return None
531                 number = float(matchobj.group(1))
532                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533                 return long(round(number * multiplier))
534
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE gets a reference back to us.
		ie.set_downloader(self)
539
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the PP gets a reference back to us.
		pp.set_downloader(self)
544
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				# The trailing comma suppresses print's own newline; the
				# terminator chosen above controls the line ending instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			# The message may not be representable in the console encoding.
			if not ignore_encoding_errors:
				raise
555
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode with the locale's preferred encoding before writing.
		print >>sys.stderr, message.encode(preferredencoding())
559
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# OSC escape sequence understood by xterm-compatible terminals.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
570
571         def fixed_template(self):
572                 """Checks if the output template is fixed."""
573                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
574
575         def trouble(self, message=None):
576                 """Determine action to take when a download problem appears.
577
578                 Depending on if the downloader has been configured to ignore
579                 download errors or not, this method may throw an exception or
580                 not when errors are found, after printing the message.
581                 """
582                 if message is not None:
583                         self.to_stderr(message)
584                 if not self.params.get('ignoreerrors', False):
585                         raise DownloadError(message)
586                 self._download_retcode = 1
587
588         def slow_down(self, start_time, byte_counter):
589                 """Sleep if the download speed is over the rate limit."""
590                 rate_limit = self.params.get('ratelimit', None)
591                 if rate_limit is None or byte_counter == 0:
592                         return
593                 now = time.time()
594                 elapsed = now - start_time
595                 if elapsed <= 0.0:
596                         return
597                 speed = float(byte_counter) / elapsed
598                 if speed > rate_limit:
599                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
600
601         def temp_name(self, filename):
602                 """Returns a temporary filename for the given filename."""
603                 if self.params.get('nopart', False) or filename == u'-' or \
604                                 (os.path.exists(filename) and not os.path.isfile(filename)):
605                         return filename
606                 return filename + u'.part'
607
608         def undo_temp_name(self, filename):
609                 if filename.endswith(u'.part'):
610                         return filename[:-len(u'.part')]
611                 return filename
612
613         def try_rename(self, old_filename, new_filename):
614                 try:
615                         if old_filename == new_filename:
616                                 return
617                         os.rename(old_filename, new_filename)
618                 except (IOError, OSError), err:
619                         self.trouble(u'ERROR: unable to rename file')
620
621         def try_utime(self, filename, last_modified_hdr):
622                 """Try to set the last-modified time of the given file."""
623                 if last_modified_hdr is None:
624                         return
625                 if not os.path.isfile(filename):
626                         return
627                 timestr = last_modified_hdr
628                 if timestr is None:
629                         return
630                 filetime = timeconvert(timestr)
631                 if filetime is None:
632                         return filetime
633                 try:
634                         os.utime(filename, (time.time(), filetime))
635                 except:
636                         pass
637                 return filetime
638
	def report_writedescription(self, descfn):
		"""Report that the description file is being written."""
		# Encoding errors are ignored: this is purely informational output.
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
642
	def report_writeinfojson(self, infofn):
		"""Report that the metadata file has been written."""
		# Encoding errors are ignored: this is purely informational output.
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
646
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding errors are ignored: this is purely informational output.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
650
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# '\r' plus skip_eol redraws the same console line each update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		# Mirror the progress in the terminal/console title bar.
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
659
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
663
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx."""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
667
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The filename may not be representable in the console
			# encoding; fall back to a message without it.
			self.to_screen(u'[download] The file has already been downloaded')
674
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
678
        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        # No progress line was shown, so print an explicit message
                        self.to_screen(u'[download] Download completed')
                else:
                        # The progress line is already on screen; just terminate it
                        self.to_screen(u'')
685
        def increment_downloads(self):
                """Increment the ordinal that assigns a number to each file."""
                # This counter feeds the %(autonumber)s output-template field
                # (see prepare_filename)
                self._num_downloads += 1
689
690         def prepare_filename(self, info_dict):
691                 """Generate the output filename."""
692                 try:
693                         template_dict = dict(info_dict)
694                         template_dict['epoch'] = unicode(long(time.time()))
695                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696                         filename = self.params['outtmpl'] % template_dict
697                         return filename
698                 except (ValueError, KeyError), err:
699                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
700                         return None
701
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Handles the forced printings, simulate mode, title
                match/reject filtering, writing of the .description and
                .info.json side files and, unless 'skip_download' is set,
                the download itself followed by postprocessing.
                """
                filename = self.prepare_filename(info_dict)
                
                # Forced printings
                if self.params.get('forcetitle', False):
                        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceurl', False):
                        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcedescription', False) and 'description' in info_dict:
                        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcefilename', False) and filename is not None:
                        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceformat', False):
                        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        return

                # prepare_filename() already reported the error in this case
                if filename is None:
                        return

                # Title-based filtering
                matchtitle=self.params.get('matchtitle',False)
                rejecttitle=self.params.get('rejecttitle',False)
                title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
                        return
                if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
                        return
                        
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                # Create the containing directory if needed
                try:
                        dn = os.path.dirname(filename)
                        if dn != '' and not os.path.exists(dn):
                                os.makedirs(dn)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
                        return

                if self.params.get('writedescription', False):
                        try:
                                descfn = filename + '.description'
                                self.report_writedescription(descfn)
                                descfile = open(descfn, 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return

                if self.params.get('writeinfojson', False):
                        infofn = filename + '.info.json'
                        self.report_writeinfojson(infofn)
                        # Probe for a usable json module (may be missing or
                        # incomplete on older Pythons)
                        try:
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(infofn, 'wb')
                                try:
                                        # 'urlhandle' holds an open connection object
                                        # and cannot be serialized to JSON
                                        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                                        json.dump(json_info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                                return

                if not self.params.get('skip_download', False):
                        try:
                                success = self._do_download(filename, info_dict)
                        except (OSError, IOError), err:
                                raise UnavailableVideoError
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                                return
                        except (ContentTooShortError, ), err:
                                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                                return
        
                        if success:
                                try:
                                        self.post_process(filename, info_dict)
                                except (PostProcessingError), err:
                                        self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                        return
799
800         def download(self, url_list):
801                 """Download a given list of URLs."""
802                 if len(url_list) > 1 and self.fixed_template():
803                         raise SameFileError(self.params['outtmpl'])
804
805                 for url in url_list:
806                         suitable_found = False
807                         for ie in self._ies:
808                                 # Go to next InfoExtractor if not suitable
809                                 if not ie.suitable(url):
810                                         continue
811
812                                 # Suitable InfoExtractor found
813                                 suitable_found = True
814
815                                 # Extract information from URL and process it
816                                 ie.extract(url)
817
818                                 # Suitable InfoExtractor had been found; go to next URL
819                                 break
820
821                         if not suitable_found:
822                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
823
824                 return self._download_retcode
825
826         def post_process(self, filename, ie_info):
827                 """Run the postprocessing chain on the given file."""
828                 info = dict(ie_info)
829                 info['filepath'] = filename
830                 for pp in self._pps:
831                         info = pp.run(info)
832                         if info is None:
833                                 break
834
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an rtmp:// URL by driving the external rtmpdump tool.

                Downloads to a temporary name, resuming as long as rtmpdump
                keeps making progress, then renames to the final filename.
                Returns True on success, False otherwise.
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        # No progress and exit code 1: give up, report failure below
                        if prevsize == cursize and retval == 1:
                                break
                         # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
                        if prevsize == cursize and retval == 2 and cursize > 1024:
                                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                                retval = 0
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
871
872         def _do_download(self, filename, info_dict):
873                 url = info_dict['url']
874                 player_url = info_dict.get('player_url', None)
875
876                 # Check file already present
877                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
878                         self.report_file_already_downloaded(filename)
879                         return True
880
881                 # Attempt to download using rtmpdump
882                 if url.startswith('rtmp'):
883                         return self._download_with_rtmpdump(filename, url, player_url)
884
885                 tmpfilename = self.temp_name(filename)
886                 stream = None
887
888                 # Do not include the Accept-Encoding header
889                 headers = {'Youtubedl-no-compression': 'True'}
890                 basic_request = urllib2.Request(url, None, headers)
891                 request = urllib2.Request(url, None, headers)
892
893                 # Establish possible resume length
894                 if os.path.isfile(tmpfilename):
895                         resume_len = os.path.getsize(tmpfilename)
896                 else:
897                         resume_len = 0
898
899                 open_mode = 'wb'
900                 if resume_len != 0:
901                         if self.params.get('continuedl', False):
902                                 self.report_resuming_byte(resume_len)
903                                 request.add_header('Range','bytes=%d-' % resume_len)
904                                 open_mode = 'ab'
905                         else:
906                                 resume_len = 0
907
908                 count = 0
909                 retries = self.params.get('retries', 0)
910                 while count <= retries:
911                         # Establish connection
912                         try:
913                                 if count == 0 and 'urlhandle' in info_dict:
914                                         data = info_dict['urlhandle']
915                                 data = urllib2.urlopen(request)
916                                 break
917                         except (urllib2.HTTPError, ), err:
918                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
919                                         # Unexpected HTTP error
920                                         raise
921                                 elif err.code == 416:
922                                         # Unable to resume (requested range not satisfiable)
923                                         try:
924                                                 # Open the connection again without the range header
925                                                 data = urllib2.urlopen(basic_request)
926                                                 content_length = data.info()['Content-Length']
927                                         except (urllib2.HTTPError, ), err:
928                                                 if err.code < 500 or err.code >= 600:
929                                                         raise
930                                         else:
931                                                 # Examine the reported length
932                                                 if (content_length is not None and
933                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
934                                                         # The file had already been fully downloaded.
935                                                         # Explanation to the above condition: in issue #175 it was revealed that
936                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
937                                                         # changing the file size slightly and causing problems for some users. So
938                                                         # I decided to implement a suggested change and consider the file
939                                                         # completely downloaded if the file size differs less than 100 bytes from
940                                                         # the one in the hard drive.
941                                                         self.report_file_already_downloaded(filename)
942                                                         self.try_rename(tmpfilename, filename)
943                                                         return True
944                                                 else:
945                                                         # The length does not match, we start the download over
946                                                         self.report_unable_to_resume()
947                                                         open_mode = 'wb'
948                                                         break
949                         # Retry
950                         count += 1
951                         if count <= retries:
952                                 self.report_retry(count, retries)
953
954                 if count > retries:
955                         self.trouble(u'ERROR: giving up after %s retries' % retries)
956                         return False
957
958                 data_len = data.info().get('Content-length', None)
959                 if data_len is not None:
960                         data_len = long(data_len) + resume_len
961                 data_len_str = self.format_bytes(data_len)
962                 byte_counter = 0 + resume_len
963                 block_size = 1024
964                 start = time.time()
965                 while True:
966                         # Download and write
967                         before = time.time()
968                         data_block = data.read(block_size)
969                         after = time.time()
970                         if len(data_block) == 0:
971                                 break
972                         byte_counter += len(data_block)
973
974                         # Open file just in time
975                         if stream is None:
976                                 try:
977                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
978                                         assert stream is not None
979                                         filename = self.undo_temp_name(tmpfilename)
980                                         self.report_destination(filename)
981                                 except (OSError, IOError), err:
982                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
983                                         return False
984                         try:
985                                 stream.write(data_block)
986                         except (IOError, OSError), err:
987                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
988                                 return False
989                         block_size = self.best_block_size(after - before, len(data_block))
990
991                         # Progress message
992                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
993                         if data_len is None:
994                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
995                         else:
996                                 percent_str = self.calc_percent(byte_counter, data_len)
997                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
998                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
999
1000                         # Apply rate limit
1001                         self.slow_down(start, byte_counter - resume_len)
1002
1003                 if stream is None:
1004                         self.trouble(u'\nERROR: Did not get any data blocks')
1005                         return False
1006                 stream.close()
1007                 self.report_finish()
1008                 if data_len is not None and byte_counter != data_len:
1009                         raise ContentTooShortError(byte_counter, long(data_len))
1010                 self.try_rename(tmpfilename, filename)
1011
1012                 # Update file modification time
1013                 if self.params.get('updatetime', True):
1014                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1015
1016                 return True
1017
1018
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor takes a URL and pulls out of the page(s)
        it refers to everything the FileDownloader needs: the real video
        URL, the title and so on.  The result is a dictionary that is then
        handed to the FileDownloader, which may download the video to the
        file system, among other possible outcomes.  Each dictionary must
        include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional; their primary purpose is to let
        youtube-dl serve as the backend for a video search function (such
        as the one in youtube2mp3), and they are only used when their
        respective forced-printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should re-define _real_initialize() and _real_extract()
        and define a _VALID_URL regexp.  Probably, they should also be
        added to the list of extractors.
        """

        # Set to True once _real_initialize() has run
        _ready = False
        # The FileDownloader this extractor reports to (may be None)
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Receives a URL and returns True if suitable for this IE."""
                return re.match(self._VALID_URL, url) is not None

        def initialize(self):
                """Initializes an instance (authentication, etc), once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
1087
1088
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Matches youtu.be short links, /v|embed|e/ paths, watch URLs with
        # a v= parameter, and bare video ids; the id ends up in group 2.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # hl=en / gl=US pin the site to English so scraping sees known strings
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in ~/.netrc for stored credentials
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        # Maps format id -> filename extension of the served container
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
        }
        # Maps format id -> dimensions string, shown by _print_formats
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
        }
        IE_NAME = u'youtube'
1126
        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')
1130
        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')
1134
        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')
1138
        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1142
        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1146
        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1150
        def report_unavailable_format(self, video_id, format):
                """Report that the requested format is not available.

                (The previous docstring, "Report extracted video URL.",
                described a different method.)
                """
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1154
        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')
1158
        def _print_formats(self, formats):
                """Print each available format id with its extension and dimensions."""
                print 'Available formats:'
                for x in formats:
                        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1163
        def _real_initialize(self):
                """Set the site language and optionally log in and confirm age.

                Credentials come from the downloader's 'username'/'password'
                params or, with 'usenetrc', from the 'youtube' entry in
                ~/.netrc.  Language and login failures are reported as
                warnings and abort initialization; a failed age
                confirmation is reported as an error.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still in the response, the
                        # credentials were rejected
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1232
1233         def _real_extract(self, url):
1234                 # Extract video id from URL
1235                 mobj = re.match(self._VALID_URL, url)
1236                 if mobj is None:
1237                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1238                         return
1239                 video_id = mobj.group(2)
1240
1241                 # Get video webpage
1242                 self.report_video_webpage_download(video_id)
1243                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1244                 try:
1245                         video_webpage = urllib2.urlopen(request).read()
1246                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1247                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1248                         return
1249
1250                 # Attempt to extract SWF player URL
1251                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1252                 if mobj is not None:
1253                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1254                 else:
1255                         player_url = None
1256
1257                 # Get video info
1258                 self.report_video_info_webpage_download(video_id)
1259                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1260                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1261                                         % (video_id, el_type))
1262                         request = urllib2.Request(video_info_url)
1263                         try:
1264                                 video_info_webpage = urllib2.urlopen(request).read()
1265                                 video_info = parse_qs(video_info_webpage)
1266                                 if 'token' in video_info:
1267                                         break
1268                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1269                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1270                                 return
1271                 if 'token' not in video_info:
1272                         if 'reason' in video_info:
1273                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1274                         else:
1275                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1276                         return
1277
1278                 # Start extracting information
1279                 self.report_information_extraction(video_id)
1280
1281                 # uploader
1282                 if 'author' not in video_info:
1283                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1284                         return
1285                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1286
1287                 # title
1288                 if 'title' not in video_info:
1289                         self._downloader.trouble(u'ERROR: unable to extract video title')
1290                         return
1291                 video_title = urllib.unquote_plus(video_info['title'][0])
1292                 video_title = video_title.decode('utf-8')
1293                 video_title = sanitize_title(video_title)
1294
1295                 # simplified title
1296                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1297                 simple_title = simple_title.strip(ur'_')
1298
1299                 # thumbnail image
1300                 if 'thumbnail_url' not in video_info:
1301                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1302                         video_thumbnail = ''
1303                 else:   # don't panic if we can't find it
1304                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1305
1306                 # upload date
1307                 upload_date = u'NA'
1308                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1309                 if mobj is not None:
1310                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1311                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1312                         for expression in format_expressions:
1313                                 try:
1314                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1315                                 except:
1316                                         pass
1317
1318                 # description
1319                 try:
1320                         lxml.etree
1321                 except NameError:
1322                         video_description = u'No description available.'
1323                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1324                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1325                                 if mobj is not None:
1326                                         video_description = mobj.group(1).decode('utf-8')
1327                 else:
1328                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1329                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1330                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1331                         # TODO use another parser
1332
1333                 # token
1334                 video_token = urllib.unquote_plus(video_info['token'][0])
1335
1336                 # Decide which formats to download
1337                 req_format = self._downloader.params.get('format', None)
1338
1339                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1340                         self.report_rtmp_download()
1341                         video_url_list = [(None, video_info['conn'][0])]
1342                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1343                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1344                         url_data = [parse_qs(uds) for uds in url_data_strs]
1345                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1346                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1347
1348                         format_limit = self._downloader.params.get('format_limit', None)
1349                         if format_limit is not None and format_limit in self._available_formats:
1350                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1351                         else:
1352                                 format_list = self._available_formats
1353                         existing_formats = [x for x in format_list if x in url_map]
1354                         if len(existing_formats) == 0:
1355                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1356                                 return
1357                         if self._downloader.params.get('listformats', None):
1358                                 self._print_formats(existing_formats)
1359                                 return
1360                         if req_format is None or req_format == 'best':
1361                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1362                         elif req_format == 'worst':
1363                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1364                         elif req_format in ('-1', 'all'):
1365                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1366                         else:
1367                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1368                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1369                                 req_formats = req_format.split('/')
1370                                 video_url_list = None
1371                                 for rf in req_formats:
1372                                         if rf in url_map:
1373                                                 video_url_list = [(rf, url_map[rf])]
1374                                                 break
1375                                 if video_url_list is None:
1376                                         self._downloader.trouble(u'ERROR: requested format not available')
1377                                         return
1378                 else:
1379                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1380                         return
1381
1382                 for format_param, video_real_url in video_url_list:
1383                         # At this point we have a new video
1384                         self._downloader.increment_downloads()
1385
1386                         # Extension
1387                         video_extension = self._video_extensions.get(format_param, 'flv')
1388
1389                         try:
1390                                 # Process video information
1391                                 self._downloader.process_info({
1392                                         'id':           video_id.decode('utf-8'),
1393                                         'url':          video_real_url.decode('utf-8'),
1394                                         'uploader':     video_uploader.decode('utf-8'),
1395                                         'upload_date':  upload_date,
1396                                         'title':        video_title,
1397                                         'stitle':       simple_title,
1398                                         'ext':          video_extension.decode('utf-8'),
1399                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1400                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1401                                         'description':  video_description,
1402                                         'player_url':   player_url,
1403                                 })
1404                         except UnavailableVideoError, err:
1405                                 self._downloader.trouble(u'\nERROR: unable to download video')
1406
1407
1408 class MetacafeIE(InfoExtractor):
1409         """Information Extractor for metacafe.com."""
1410
1411         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1412         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1413         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1414         _youtube_ie = None
1415         IE_NAME = u'metacafe'
1416
1417         def __init__(self, youtube_ie, downloader=None):
1418                 InfoExtractor.__init__(self, downloader)
1419                 self._youtube_ie = youtube_ie
1420
1421         def report_disclaimer(self):
1422                 """Report disclaimer retrieval."""
1423                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1424
1425         def report_age_confirmation(self):
1426                 """Report attempt to confirm age."""
1427                 self._downloader.to_screen(u'[metacafe] Confirming age')
1428
1429         def report_download_webpage(self, video_id):
1430                 """Report webpage download."""
1431                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1432
1433         def report_extraction(self, video_id):
1434                 """Report information extraction."""
1435                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1436
1437         def _real_initialize(self):
1438                 # Retrieve disclaimer
1439                 request = urllib2.Request(self._DISCLAIMER)
1440                 try:
1441                         self.report_disclaimer()
1442                         disclaimer = urllib2.urlopen(request).read()
1443                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1445                         return
1446
1447                 # Confirm age
1448                 disclaimer_form = {
1449                         'filters': '0',
1450                         'submit': "Continue - I'm over 18",
1451                         }
1452                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1453                 try:
1454                         self.report_age_confirmation()
1455                         disclaimer = urllib2.urlopen(request).read()
1456                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1457                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1458                         return
1459
1460         def _real_extract(self, url):
1461                 # Extract id and simplified title from URL
1462                 mobj = re.match(self._VALID_URL, url)
1463                 if mobj is None:
1464                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1465                         return
1466
1467                 video_id = mobj.group(1)
1468
1469                 # Check if video comes from YouTube
1470                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1471                 if mobj2 is not None:
1472                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1473                         return
1474
1475                 # At this point we have a new video
1476                 self._downloader.increment_downloads()
1477
1478                 simple_title = mobj.group(2).decode('utf-8')
1479
1480                 # Retrieve video webpage to extract further information
1481                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1482                 try:
1483                         self.report_download_webpage(video_id)
1484                         webpage = urllib2.urlopen(request).read()
1485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1487                         return
1488
1489                 # Extract URL, uploader and title from webpage
1490                 self.report_extraction(video_id)
1491                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1492                 if mobj is not None:
1493                         mediaURL = urllib.unquote(mobj.group(1))
1494                         video_extension = mediaURL[-3:]
1495
1496                         # Extract gdaKey if available
1497                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1498                         if mobj is None:
1499                                 video_url = mediaURL
1500                         else:
1501                                 gdaKey = mobj.group(1)
1502                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1503                 else:
1504                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1505                         if mobj is None:
1506                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1507                                 return
1508                         vardict = parse_qs(mobj.group(1))
1509                         if 'mediaData' not in vardict:
1510                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1511                                 return
1512                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1513                         if mobj is None:
1514                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1515                                 return
1516                         mediaURL = mobj.group(1).replace('\\/', '/')
1517                         video_extension = mediaURL[-3:]
1518                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1519
1520                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1521                 if mobj is None:
1522                         self._downloader.trouble(u'ERROR: unable to extract title')
1523                         return
1524                 video_title = mobj.group(1).decode('utf-8')
1525                 video_title = sanitize_title(video_title)
1526
1527                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1530                         return
1531                 video_uploader = mobj.group(1)
1532
1533                 try:
1534                         # Process video information
1535                         self._downloader.process_info({
1536                                 'id':           video_id.decode('utf-8'),
1537                                 'url':          video_url.decode('utf-8'),
1538                                 'uploader':     video_uploader.decode('utf-8'),
1539                                 'upload_date':  u'NA',
1540                                 'title':        video_title,
1541                                 'stitle':       simple_title,
1542                                 'ext':          video_extension.decode('utf-8'),
1543                                 'format':       u'NA',
1544                                 'player_url':   None,
1545                         })
1546                 except UnavailableVideoError:
1547                         self._downloader.trouble(u'\nERROR: unable to download video')
1548
1549
1550 class DailymotionIE(InfoExtractor):
1551         """Information Extractor for Dailymotion"""
1552
1553         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1554         IE_NAME = u'dailymotion'
1555
1556         def __init__(self, downloader=None):
1557                 InfoExtractor.__init__(self, downloader)
1558
1559         def report_download_webpage(self, video_id):
1560                 """Report webpage download."""
1561                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1562
1563         def report_extraction(self, video_id):
1564                 """Report information extraction."""
1565                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1566
1567         def _real_extract(self, url):
1568                 # Extract id and simplified title from URL
1569                 mobj = re.match(self._VALID_URL, url)
1570                 if mobj is None:
1571                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1572                         return
1573
1574                 # At this point we have a new video
1575                 self._downloader.increment_downloads()
1576                 video_id = mobj.group(1)
1577
1578                 simple_title = mobj.group(2).decode('utf-8')
1579                 video_extension = 'flv'
1580
1581                 # Retrieve video webpage to extract further information
1582                 request = urllib2.Request(url)
1583                 request.add_header('Cookie', 'family_filter=off')
1584                 try:
1585                         self.report_download_webpage(video_id)
1586                         webpage = urllib2.urlopen(request).read()
1587                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1588                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1589                         return
1590
1591                 # Extract URL, uploader and title from webpage
1592                 self.report_extraction(video_id)
1593                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1594                 if mobj is None:
1595                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1596                         return
1597                 sequence = urllib.unquote(mobj.group(1))
1598                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1599                 if mobj is None:
1600                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1601                         return
1602                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1603
1604                 # if needed add http://www.dailymotion.com/ if relative URL
1605
1606                 video_url = mediaURL
1607
1608                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1609                 if mobj is None:
1610                         self._downloader.trouble(u'ERROR: unable to extract title')
1611                         return
1612                 video_title = mobj.group(1).decode('utf-8')
1613                 video_title = sanitize_title(video_title)
1614
1615                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1616                 if mobj is None:
1617                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1618                         return
1619                 video_uploader = mobj.group(1)
1620
1621                 try:
1622                         # Process video information
1623                         self._downloader.process_info({
1624                                 'id':           video_id.decode('utf-8'),
1625                                 'url':          video_url.decode('utf-8'),
1626                                 'uploader':     video_uploader.decode('utf-8'),
1627                                 'upload_date':  u'NA',
1628                                 'title':        video_title,
1629                                 'stitle':       simple_title,
1630                                 'ext':          video_extension.decode('utf-8'),
1631                                 'format':       u'NA',
1632                                 'player_url':   None,
1633                         })
1634                 except UnavailableVideoError:
1635                         self._downloader.trouble(u'\nERROR: unable to download video')
1636
1637
1638 class GoogleIE(InfoExtractor):
1639         """Information extractor for video.google.com."""
1640
1641         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1642         IE_NAME = u'video.google'
1643
1644         def __init__(self, downloader=None):
1645                 InfoExtractor.__init__(self, downloader)
1646
1647         def report_download_webpage(self, video_id):
1648                 """Report webpage download."""
1649                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1650
1651         def report_extraction(self, video_id):
1652                 """Report information extraction."""
1653                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1654
1655         def _real_extract(self, url):
1656                 # Extract id from URL
1657                 mobj = re.match(self._VALID_URL, url)
1658                 if mobj is None:
1659                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1660                         return
1661
1662                 # At this point we have a new video
1663                 self._downloader.increment_downloads()
1664                 video_id = mobj.group(1)
1665
1666                 video_extension = 'mp4'
1667
1668                 # Retrieve video webpage to extract further information
1669                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1670                 try:
1671                         self.report_download_webpage(video_id)
1672                         webpage = urllib2.urlopen(request).read()
1673                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1674                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1675                         return
1676
1677                 # Extract URL, uploader, and title from webpage
1678                 self.report_extraction(video_id)
1679                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1680                 if mobj is None:
1681                         video_extension = 'flv'
1682                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1683                 if mobj is None:
1684                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1685                         return
1686                 mediaURL = urllib.unquote(mobj.group(1))
1687                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1688                 mediaURL = mediaURL.replace('\\x26', '\x26')
1689
1690                 video_url = mediaURL
1691
1692                 mobj = re.search(r'<title>(.*)</title>', webpage)
1693                 if mobj is None:
1694                         self._downloader.trouble(u'ERROR: unable to extract title')
1695                         return
1696                 video_title = mobj.group(1).decode('utf-8')
1697                 video_title = sanitize_title(video_title)
1698                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1699
1700                 # Extract video description
1701                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1702                 if mobj is None:
1703                         self._downloader.trouble(u'ERROR: unable to extract video description')
1704                         return
1705                 video_description = mobj.group(1).decode('utf-8')
1706                 if not video_description:
1707                         video_description = 'No description available.'
1708
1709                 # Extract video thumbnail
1710                 if self._downloader.params.get('forcethumbnail', False):
1711                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1712                         try:
1713                                 webpage = urllib2.urlopen(request).read()
1714                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1715                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1716                                 return
1717                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1718                         if mobj is None:
1719                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1720                                 return
1721                         video_thumbnail = mobj.group(1)
1722                 else:   # we need something to pass to process_info
1723                         video_thumbnail = ''
1724
1725                 try:
1726                         # Process video information
1727                         self._downloader.process_info({
1728                                 'id':           video_id.decode('utf-8'),
1729                                 'url':          video_url.decode('utf-8'),
1730                                 'uploader':     u'NA',
1731                                 'upload_date':  u'NA',
1732                                 'title':        video_title,
1733                                 'stitle':       simple_title,
1734                                 'ext':          video_extension.decode('utf-8'),
1735                                 'format':       u'NA',
1736                                 'player_url':   None,
1737                         })
1738                 except UnavailableVideoError:
1739                         self._downloader.trouble(u'\nERROR: unable to download video')
1740
1741
1742 class PhotobucketIE(InfoExtractor):
1743         """Information extractor for photobucket.com."""
1744
1745         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1746         IE_NAME = u'photobucket'
1747
1748         def __init__(self, downloader=None):
1749                 InfoExtractor.__init__(self, downloader)
1750
1751         def report_download_webpage(self, video_id):
1752                 """Report webpage download."""
1753                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1754
1755         def report_extraction(self, video_id):
1756                 """Report information extraction."""
1757                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1758
1759         def _real_extract(self, url):
1760                 # Extract id from URL
1761                 mobj = re.match(self._VALID_URL, url)
1762                 if mobj is None:
1763                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1764                         return
1765
1766                 # At this point we have a new video
1767                 self._downloader.increment_downloads()
1768                 video_id = mobj.group(1)
1769
1770                 video_extension = 'flv'
1771
1772                 # Retrieve video webpage to extract further information
1773                 request = urllib2.Request(url)
1774                 try:
1775                         self.report_download_webpage(video_id)
1776                         webpage = urllib2.urlopen(request).read()
1777                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1778                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1779                         return
1780
1781                 # Extract URL, uploader, and title from webpage
1782                 self.report_extraction(video_id)
1783                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1784                 if mobj is None:
1785                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1786                         return
1787                 mediaURL = urllib.unquote(mobj.group(1))
1788
1789                 video_url = mediaURL
1790
1791                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1792                 if mobj is None:
1793                         self._downloader.trouble(u'ERROR: unable to extract title')
1794                         return
1795                 video_title = mobj.group(1).decode('utf-8')
1796                 video_title = sanitize_title(video_title)
1797                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1798
1799                 video_uploader = mobj.group(2).decode('utf-8')
1800
1801                 try:
1802                         # Process video information
1803                         self._downloader.process_info({
1804                                 'id':           video_id.decode('utf-8'),
1805                                 'url':          video_url.decode('utf-8'),
1806                                 'uploader':     video_uploader,
1807                                 'upload_date':  u'NA',
1808                                 'title':        video_title,
1809                                 'stitle':       simple_title,
1810                                 'ext':          video_extension.decode('utf-8'),
1811                                 'format':       u'NA',
1812                                 'player_url':   None,
1813                         })
1814                 except UnavailableVideoError:
1815                         self._downloader.trouble(u'\nERROR: unable to download video')
1816
1817
1818 class YahooIE(InfoExtractor):
1819         """Information extractor for video.yahoo.com."""
1820
1821         # _VALID_URL matches all Yahoo! Video URLs
1822         # _VPAGE_URL matches only the extractable '/watch/' URLs
1823         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1824         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1825         IE_NAME = u'video.yahoo'
1826
1827         def __init__(self, downloader=None):
1828                 InfoExtractor.__init__(self, downloader)
1829
1830         def report_download_webpage(self, video_id):
1831                 """Report webpage download."""
1832                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1833
1834         def report_extraction(self, video_id):
1835                 """Report information extraction."""
1836                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1837
1838         def _real_extract(self, url, new_video=True):
1839                 # Extract ID from URL
1840                 mobj = re.match(self._VALID_URL, url)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1843                         return
1844
1845                 # At this point we have a new video
1846                 self._downloader.increment_downloads()
1847                 video_id = mobj.group(2)
1848                 video_extension = 'flv'
1849
1850                 # Rewrite valid but non-extractable URLs as
1851                 # extractable English language /watch/ URLs
1852                 if re.match(self._VPAGE_URL, url) is None:
1853                         request = urllib2.Request(url)
1854                         try:
1855                                 webpage = urllib2.urlopen(request).read()
1856                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1857                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1858                                 return
1859
1860                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1861                         if mobj is None:
1862                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1863                                 return
1864                         yahoo_id = mobj.group(1)
1865
1866                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1867                         if mobj is None:
1868                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1869                                 return
1870                         yahoo_vid = mobj.group(1)
1871
1872                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1873                         return self._real_extract(url, new_video=False)
1874
1875                 # Retrieve video webpage to extract further information
1876                 request = urllib2.Request(url)
1877                 try:
1878                         self.report_download_webpage(video_id)
1879                         webpage = urllib2.urlopen(request).read()
1880                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1881                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1882                         return
1883
1884                 # Extract uploader and title from webpage
1885                 self.report_extraction(video_id)
1886                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1887                 if mobj is None:
1888                         self._downloader.trouble(u'ERROR: unable to extract video title')
1889                         return
1890                 video_title = mobj.group(1).decode('utf-8')
1891                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1892
1893                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1894                 if mobj is None:
1895                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1896                         return
1897                 video_uploader = mobj.group(1).decode('utf-8')
1898
1899                 # Extract video thumbnail
1900                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1901                 if mobj is None:
1902                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1903                         return
1904                 video_thumbnail = mobj.group(1).decode('utf-8')
1905
1906                 # Extract video description
1907                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1908                 if mobj is None:
1909                         self._downloader.trouble(u'ERROR: unable to extract video description')
1910                         return
1911                 video_description = mobj.group(1).decode('utf-8')
1912                 if not video_description:
1913                         video_description = 'No description available.'
1914
1915                 # Extract video height and width
1916                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1917                 if mobj is None:
1918                         self._downloader.trouble(u'ERROR: unable to extract video height')
1919                         return
1920                 yv_video_height = mobj.group(1)
1921
1922                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1923                 if mobj is None:
1924                         self._downloader.trouble(u'ERROR: unable to extract video width')
1925                         return
1926                 yv_video_width = mobj.group(1)
1927
1928                 # Retrieve video playlist to extract media URL
1929                 # I'm not completely sure what all these options are, but we
1930                 # seem to need most of them, otherwise the server sends a 401.
1931                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1932                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1933                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1934                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1935                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1936                 try:
1937                         self.report_download_webpage(video_id)
1938                         webpage = urllib2.urlopen(request).read()
1939                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1940                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1941                         return
1942
1943                 # Extract media URL from playlist XML
1944                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1945                 if mobj is None:
1946                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1947                         return
1948                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1949                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1950
1951                 try:
1952                         # Process video information
1953                         self._downloader.process_info({
1954                                 'id':           video_id.decode('utf-8'),
1955                                 'url':          video_url,
1956                                 'uploader':     video_uploader,
1957                                 'upload_date':  u'NA',
1958                                 'title':        video_title,
1959                                 'stitle':       simple_title,
1960                                 'ext':          video_extension.decode('utf-8'),
1961                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1962                                 'description':  video_description,
1963                                 'thumbnail':    video_thumbnail,
1964                                 'player_url':   None,
1965                         })
1966                 except UnavailableVideoError:
1967                         self._downloader.trouble(u'\nERROR: unable to download video')
1968
1969
1970 class VimeoIE(InfoExtractor):
1971         """Information extractor for vimeo.com."""
1972
1973         # _VALID_URL matches Vimeo URLs
1974         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1975         IE_NAME = u'vimeo'
1976
1977         def __init__(self, downloader=None):
1978                 InfoExtractor.__init__(self, downloader)
1979
1980         def report_download_webpage(self, video_id):
1981                 """Report webpage download."""
1982                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1983
1984         def report_extraction(self, video_id):
1985                 """Report information extraction."""
1986                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1987
1988         def _real_extract(self, url, new_video=True):
1989                 # Extract ID from URL
1990                 mobj = re.match(self._VALID_URL, url)
1991                 if mobj is None:
1992                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1993                         return
1994
1995                 # At this point we have a new video
1996                 self._downloader.increment_downloads()
1997                 video_id = mobj.group(1)
1998
1999                 # Retrieve video webpage to extract further information
2000                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2001                 try:
2002                         self.report_download_webpage(video_id)
2003                         webpage = urllib2.urlopen(request).read()
2004                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2006                         return
2007
2008                 # Now we begin extracting as much information as we can from what we
2009                 # retrieved. First we extract the information common to all extractors,
2010                 # and latter we extract those that are Vimeo specific.
2011                 self.report_extraction(video_id)
2012
2013                 # Extract title
2014                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2015                 if mobj is None:
2016                         self._downloader.trouble(u'ERROR: unable to extract video title')
2017                         return
2018                 video_title = mobj.group(1).decode('utf-8')
2019                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2020
2021                 # Extract uploader
2022                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2023                 if mobj is None:
2024                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2025                         return
2026                 video_uploader = mobj.group(1).decode('utf-8')
2027
2028                 # Extract video thumbnail
2029                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2030                 if mobj is None:
2031                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2032                         return
2033                 video_thumbnail = mobj.group(1).decode('utf-8')
2034
2035                 # # Extract video description
2036                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2037                 # if mobj is None:
2038                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2039                 #       return
2040                 # video_description = mobj.group(1).decode('utf-8')
2041                 # if not video_description: video_description = 'No description available.'
2042                 video_description = 'Foo.'
2043
2044                 # Vimeo specific: extract request signature
2045                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2046                 if mobj is None:
2047                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2048                         return
2049                 sig = mobj.group(1).decode('utf-8')
2050
2051                 # Vimeo specific: extract video quality information
2052                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2053                 if mobj is None:
2054                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2055                         return
2056                 quality = mobj.group(1).decode('utf-8')
2057
2058                 if int(quality) == 1:
2059                         quality = 'hd'
2060                 else:
2061                         quality = 'sd'
2062
2063                 # Vimeo specific: Extract request signature expiration
2064                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2065                 if mobj is None:
2066                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2067                         return
2068                 sig_exp = mobj.group(1).decode('utf-8')
2069
2070                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2071
2072                 try:
2073                         # Process video information
2074                         self._downloader.process_info({
2075                                 'id':           video_id.decode('utf-8'),
2076                                 'url':          video_url,
2077                                 'uploader':     video_uploader,
2078                                 'upload_date':  u'NA',
2079                                 'title':        video_title,
2080                                 'stitle':       simple_title,
2081                                 'ext':          u'mp4',
2082                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2083                                 'description':  video_description,
2084                                 'thumbnail':    video_thumbnail,
2085                                 'description':  video_description,
2086                                 'player_url':   None,
2087                         })
2088                 except UnavailableVideoError:
2089                         self._downloader.trouble(u'ERROR: unable to download video')
2090
2091
2092 class GenericIE(InfoExtractor):
2093         """Generic last-resort information extractor."""
2094
2095         _VALID_URL = r'.*'
2096         IE_NAME = u'generic'
2097
2098         def __init__(self, downloader=None):
2099                 InfoExtractor.__init__(self, downloader)
2100
2101         def report_download_webpage(self, video_id):
2102                 """Report webpage download."""
2103                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2104                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2105
2106         def report_extraction(self, video_id):
2107                 """Report information extraction."""
2108                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2109
2110         def _real_extract(self, url):
2111                 # At this point we have a new video
2112                 self._downloader.increment_downloads()
2113
2114                 video_id = url.split('/')[-1]
2115                 request = urllib2.Request(url)
2116                 try:
2117                         self.report_download_webpage(video_id)
2118                         webpage = urllib2.urlopen(request).read()
2119                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2120                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2121                         return
2122                 except ValueError, err:
2123                         # since this is the last-resort InfoExtractor, if
2124                         # this error is thrown, it'll be thrown here
2125                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2126                         return
2127
2128                 self.report_extraction(video_id)
2129                 # Start with something easy: JW Player in SWFObject
2130                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2131                 if mobj is None:
2132                         # Broaden the search a little bit
2133                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2134                 if mobj is None:
2135                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2136                         return
2137
2138                 # It's possible that one of the regexes
2139                 # matched, but returned an empty group:
2140                 if mobj.group(1) is None:
2141                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2142                         return
2143
2144                 video_url = urllib.unquote(mobj.group(1))
2145                 video_id = os.path.basename(video_url)
2146
2147                 # here's a fun little line of code for you:
2148                 video_extension = os.path.splitext(video_id)[1][1:]
2149                 video_id = os.path.splitext(video_id)[0]
2150
2151                 # it's tempting to parse this further, but you would
2152                 # have to take into account all the variations like
2153                 #   Video Title - Site Name
2154                 #   Site Name | Video Title
2155                 #   Video Title - Tagline | Site Name
2156                 # and so on and so forth; it's just not practical
2157                 mobj = re.search(r'<title>(.*)</title>', webpage)
2158                 if mobj is None:
2159                         self._downloader.trouble(u'ERROR: unable to extract title')
2160                         return
2161                 video_title = mobj.group(1).decode('utf-8')
2162                 video_title = sanitize_title(video_title)
2163                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2164
2165                 # video uploader is domain name
2166                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2167                 if mobj is None:
2168                         self._downloader.trouble(u'ERROR: unable to extract title')
2169                         return
2170                 video_uploader = mobj.group(1).decode('utf-8')
2171
2172                 try:
2173                         # Process video information
2174                         self._downloader.process_info({
2175                                 'id':           video_id.decode('utf-8'),
2176                                 'url':          video_url.decode('utf-8'),
2177                                 'uploader':     video_uploader,
2178                                 'upload_date':  u'NA',
2179                                 'title':        video_title,
2180                                 'stitle':       simple_title,
2181                                 'ext':          video_extension.decode('utf-8'),
2182                                 'format':       u'NA',
2183                                 'player_url':   None,
2184                         })
2185                 except UnavailableVideoError, err:
2186                         self._downloader.trouble(u'\nERROR: unable to download video')
2187
2188
2189 class YoutubeSearchIE(InfoExtractor):
2190         """Information Extractor for YouTube search queries."""
2191         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2192         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2193         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2194         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2195         _youtube_ie = None
2196         _max_youtube_results = 1000
2197         IE_NAME = u'youtube:search'
2198
2199         def __init__(self, youtube_ie, downloader=None):
2200                 InfoExtractor.__init__(self, downloader)
2201                 self._youtube_ie = youtube_ie
2202
2203         def report_download_page(self, query, pagenum):
2204                 """Report attempt to download playlist page with given number."""
2205                 query = query.decode(preferredencoding())
2206                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2207
2208         def _real_initialize(self):
2209                 self._youtube_ie.initialize()
2210
2211         def _real_extract(self, query):
2212                 mobj = re.match(self._VALID_URL, query)
2213                 if mobj is None:
2214                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2215                         return
2216
2217                 prefix, query = query.split(':')
2218                 prefix = prefix[8:]
2219                 query = query.encode('utf-8')
2220                 if prefix == '':
2221                         self._download_n_results(query, 1)
2222                         return
2223                 elif prefix == 'all':
2224                         self._download_n_results(query, self._max_youtube_results)
2225                         return
2226                 else:
2227                         try:
2228                                 n = long(prefix)
2229                                 if n <= 0:
2230                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2231                                         return
2232                                 elif n > self._max_youtube_results:
2233                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2234                                         n = self._max_youtube_results
2235                                 self._download_n_results(query, n)
2236                                 return
2237                         except ValueError: # parsing prefix as integer fails
2238                                 self._download_n_results(query, 1)
2239                                 return
2240
2241         def _download_n_results(self, query, n):
2242                 """Downloads a specified number of results for a query"""
2243
2244                 video_ids = []
2245                 already_seen = set()
2246                 pagenum = 1
2247
2248                 while True:
2249                         self.report_download_page(query, pagenum)
2250                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2251                         request = urllib2.Request(result_url)
2252                         try:
2253                                 page = urllib2.urlopen(request).read()
2254                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2256                                 return
2257
2258                         # Extract video identifiers
2259                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2260                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2261                                 if video_id not in already_seen:
2262                                         video_ids.append(video_id)
2263                                         already_seen.add(video_id)
2264                                         if len(video_ids) == n:
2265                                                 # Specified n videos reached
2266                                                 for id in video_ids:
2267                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2268                                                 return
2269
2270                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2271                                 for id in video_ids:
2272                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2273                                 return
2274
2275                         pagenum = pagenum + 1
2276
2277
2278 class GoogleSearchIE(InfoExtractor):
2279         """Information Extractor for Google Video search queries."""
2280         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2281         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2282         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2283         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2284         _google_ie = None
2285         _max_google_results = 1000
2286         IE_NAME = u'video.google:search'
2287
2288         def __init__(self, google_ie, downloader=None):
2289                 InfoExtractor.__init__(self, downloader)
2290                 self._google_ie = google_ie
2291
2292         def report_download_page(self, query, pagenum):
2293                 """Report attempt to download playlist page with given number."""
2294                 query = query.decode(preferredencoding())
2295                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2296
2297         def _real_initialize(self):
2298                 self._google_ie.initialize()
2299
2300         def _real_extract(self, query):
2301                 mobj = re.match(self._VALID_URL, query)
2302                 if mobj is None:
2303                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2304                         return
2305
2306                 prefix, query = query.split(':')
2307                 prefix = prefix[8:]
2308                 query = query.encode('utf-8')
2309                 if prefix == '':
2310                         self._download_n_results(query, 1)
2311                         return
2312                 elif prefix == 'all':
2313                         self._download_n_results(query, self._max_google_results)
2314                         return
2315                 else:
2316                         try:
2317                                 n = long(prefix)
2318                                 if n <= 0:
2319                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2320                                         return
2321                                 elif n > self._max_google_results:
2322                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2323                                         n = self._max_google_results
2324                                 self._download_n_results(query, n)
2325                                 return
2326                         except ValueError: # parsing prefix as integer fails
2327                                 self._download_n_results(query, 1)
2328                                 return
2329
2330         def _download_n_results(self, query, n):
2331                 """Downloads a specified number of results for a query"""
2332
2333                 video_ids = []
2334                 already_seen = set()
2335                 pagenum = 1
2336
2337                 while True:
2338                         self.report_download_page(query, pagenum)
2339                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2340                         request = urllib2.Request(result_url)
2341                         try:
2342                                 page = urllib2.urlopen(request).read()
2343                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2345                                 return
2346
2347                         # Extract video identifiers
2348                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2349                                 video_id = mobj.group(1)
2350                                 if video_id not in already_seen:
2351                                         video_ids.append(video_id)
2352                                         already_seen.add(video_id)
2353                                         if len(video_ids) == n:
2354                                                 # Specified n videos reached
2355                                                 for id in video_ids:
2356                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2357                                                 return
2358
2359                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2360                                 for id in video_ids:
2361                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2362                                 return
2363
2364                         pagenum = pagenum + 1
2365
2366
2367 class YahooSearchIE(InfoExtractor):
2368         """Information Extractor for Yahoo! Video search queries."""
2369         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2370         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2371         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2372         _MORE_PAGES_INDICATOR = r'\s*Next'
2373         _yahoo_ie = None
2374         _max_yahoo_results = 1000
2375         IE_NAME = u'video.yahoo:search'
2376
2377         def __init__(self, yahoo_ie, downloader=None):
2378                 InfoExtractor.__init__(self, downloader)
2379                 self._yahoo_ie = yahoo_ie
2380
2381         def report_download_page(self, query, pagenum):
2382                 """Report attempt to download playlist page with given number."""
2383                 query = query.decode(preferredencoding())
2384                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2385
2386         def _real_initialize(self):
2387                 self._yahoo_ie.initialize()
2388
2389         def _real_extract(self, query):
2390                 mobj = re.match(self._VALID_URL, query)
2391                 if mobj is None:
2392                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2393                         return
2394
2395                 prefix, query = query.split(':')
2396                 prefix = prefix[8:]
2397                 query = query.encode('utf-8')
2398                 if prefix == '':
2399                         self._download_n_results(query, 1)
2400                         return
2401                 elif prefix == 'all':
2402                         self._download_n_results(query, self._max_yahoo_results)
2403                         return
2404                 else:
2405                         try:
2406                                 n = long(prefix)
2407                                 if n <= 0:
2408                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2409                                         return
2410                                 elif n > self._max_yahoo_results:
2411                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2412                                         n = self._max_yahoo_results
2413                                 self._download_n_results(query, n)
2414                                 return
2415                         except ValueError: # parsing prefix as integer fails
2416                                 self._download_n_results(query, 1)
2417                                 return
2418
2419         def _download_n_results(self, query, n):
2420                 """Downloads a specified number of results for a query"""
2421
2422                 video_ids = []
2423                 already_seen = set()
2424                 pagenum = 1
2425
2426                 while True:
2427                         self.report_download_page(query, pagenum)
2428                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2429                         request = urllib2.Request(result_url)
2430                         try:
2431                                 page = urllib2.urlopen(request).read()
2432                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2433                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2434                                 return
2435
2436                         # Extract video identifiers
2437                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2438                                 video_id = mobj.group(1)
2439                                 if video_id not in already_seen:
2440                                         video_ids.append(video_id)
2441                                         already_seen.add(video_id)
2442                                         if len(video_ids) == n:
2443                                                 # Specified n videos reached
2444                                                 for id in video_ids:
2445                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2446                                                 return
2447
2448                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2449                                 for id in video_ids:
2450                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2451                                 return
2452
2453                         pagenum = pagenum + 1
2454
2455
2456 class YoutubePlaylistIE(InfoExtractor):
2457         """Information Extractor for YouTube playlists."""
2458
2459         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2460         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2461         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2462         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2463         _youtube_ie = None
2464         IE_NAME = u'youtube:playlist'
2465
2466         def __init__(self, youtube_ie, downloader=None):
2467                 InfoExtractor.__init__(self, downloader)
2468                 self._youtube_ie = youtube_ie
2469
2470         def report_download_page(self, playlist_id, pagenum):
2471                 """Report attempt to download playlist page with given number."""
2472                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2473
2474         def _real_initialize(self):
2475                 self._youtube_ie.initialize()
2476
2477         def _real_extract(self, url):
2478                 # Extract playlist id
2479                 mobj = re.match(self._VALID_URL, url)
2480                 if mobj is None:
2481                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2482                         return
2483
2484                 # Single video case
2485                 if mobj.group(3) is not None:
2486                         self._youtube_ie.extract(mobj.group(3))
2487                         return
2488
2489                 # Download playlist pages
2490                 # prefix is 'p' as default for playlists but there are other types that need extra care
2491                 playlist_prefix = mobj.group(1)
2492                 if playlist_prefix == 'a':
2493                         playlist_access = 'artist'
2494                 else:
2495                         playlist_prefix = 'p'
2496                         playlist_access = 'view_play_list'
2497                 playlist_id = mobj.group(2)
2498                 video_ids = []
2499                 pagenum = 1
2500
2501                 while True:
2502                         self.report_download_page(playlist_id, pagenum)
2503                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2504                         request = urllib2.Request(url)
2505                         try:
2506                                 page = urllib2.urlopen(request).read()
2507                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2508                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2509                                 return
2510
2511                         # Extract video identifiers
2512                         ids_in_page = []
2513                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2514                                 if mobj.group(1) not in ids_in_page:
2515                                         ids_in_page.append(mobj.group(1))
2516                         video_ids.extend(ids_in_page)
2517
2518                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2519                                 break
2520                         pagenum = pagenum + 1
2521
2522                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2523                 playlistend = self._downloader.params.get('playlistend', -1)
2524                 video_ids = video_ids[playliststart:playlistend]
2525
2526                 for id in video_ids:
2527                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2528                 return
2529
2530
2531 class YoutubeUserIE(InfoExtractor):
2532         """Information Extractor for YouTube users."""
2533
2534         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2535         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2536         _GDATA_PAGE_SIZE = 50
2537         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2538         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2539         _youtube_ie = None
2540         IE_NAME = u'youtube:user'
2541
2542         def __init__(self, youtube_ie, downloader=None):
2543                 InfoExtractor.__init__(self, downloader)
2544                 self._youtube_ie = youtube_ie
2545
2546         def report_download_page(self, username, start_index):
2547                 """Report attempt to download user page."""
2548                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2549                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2550
2551         def _real_initialize(self):
2552                 self._youtube_ie.initialize()
2553
2554         def _real_extract(self, url):
2555                 # Extract username
2556                 mobj = re.match(self._VALID_URL, url)
2557                 if mobj is None:
2558                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2559                         return
2560
2561                 username = mobj.group(1)
2562
2563                 # Download video ids using YouTube Data API. Result size per
2564                 # query is limited (currently to 50 videos) so we need to query
2565                 # page by page until there are no video ids - it means we got
2566                 # all of them.
2567
2568                 video_ids = []
2569                 pagenum = 0
2570
2571                 while True:
2572                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2573                         self.report_download_page(username, start_index)
2574
2575                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2576
2577                         try:
2578                                 page = urllib2.urlopen(request).read()
2579                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2580                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2581                                 return
2582
2583                         # Extract video identifiers
2584                         ids_in_page = []
2585
2586                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2587                                 if mobj.group(1) not in ids_in_page:
2588                                         ids_in_page.append(mobj.group(1))
2589
2590                         video_ids.extend(ids_in_page)
2591
2592                         # A little optimization - if current page is not
2593                         # "full", ie. does not contain PAGE_SIZE video ids then
2594                         # we can assume that this page is the last one - there
2595                         # are no more ids on further pages - no need to query
2596                         # again.
2597
2598                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2599                                 break
2600
2601                         pagenum += 1
2602
2603                 all_ids_count = len(video_ids)
2604                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2605                 playlistend = self._downloader.params.get('playlistend', -1)
2606
2607                 if playlistend == -1:
2608                         video_ids = video_ids[playliststart:]
2609                 else:
2610                         video_ids = video_ids[playliststart:playlistend]
2611
2612                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2613                                 (username, all_ids_count, len(video_ids)))
2614
2615                 for video_id in video_ids:
2616                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2617
2618
2619 class DepositFilesIE(InfoExtractor):
2620         """Information extractor for depositfiles.com"""
2621
2622         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2623         IE_NAME = u'DepositFiles'
2624
2625         def __init__(self, downloader=None):
2626                 InfoExtractor.__init__(self, downloader)
2627
2628         def report_download_webpage(self, file_id):
2629                 """Report webpage download."""
2630                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2631
2632         def report_extraction(self, file_id):
2633                 """Report information extraction."""
2634                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2635
2636         def _real_extract(self, url):
2637                 # At this point we have a new file
2638                 self._downloader.increment_downloads()
2639
2640                 file_id = url.split('/')[-1]
2641                 # Rebuild url in english locale
2642                 url = 'http://depositfiles.com/en/files/' + file_id
2643
2644                 # Retrieve file webpage with 'Free download' button pressed
2645                 free_download_indication = { 'gateway_result' : '1' }
2646                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2647                 try:
2648                         self.report_download_webpage(file_id)
2649                         webpage = urllib2.urlopen(request).read()
2650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2651                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2652                         return
2653
2654                 # Search for the real file URL
2655                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2656                 if (mobj is None) or (mobj.group(1) is None):
2657                         # Try to figure out reason of the error.
2658                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2659                         if (mobj is not None) and (mobj.group(1) is not None):
2660                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2661                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2662                         else:
2663                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2664                         return
2665
2666                 file_url = mobj.group(1)
2667                 file_extension = os.path.splitext(file_url)[1][1:]
2668
2669                 # Search for file title
2670                 mobj = re.search(r'<b title="(.*?)">', webpage)
2671                 if mobj is None:
2672                         self._downloader.trouble(u'ERROR: unable to extract title')
2673                         return
2674                 file_title = mobj.group(1).decode('utf-8')
2675
2676                 try:
2677                         # Process file information
2678                         self._downloader.process_info({
2679                                 'id':           file_id.decode('utf-8'),
2680                                 'url':          file_url.decode('utf-8'),
2681                                 'uploader':     u'NA',
2682                                 'upload_date':  u'NA',
2683                                 'title':        file_title,
2684                                 'stitle':       file_title,
2685                                 'ext':          file_extension.decode('utf-8'),
2686                                 'format':       u'NA',
2687                                 'player_url':   None,
2688                         })
2689                 except UnavailableVideoError, err:
2690                         self._downloader.trouble(u'ERROR: unable to download file')
2691
2692
2693 class FacebookIE(InfoExtractor):
2694         """Information Extractor for Facebook"""
2695
2696         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2697         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2698         _NETRC_MACHINE = 'facebook'
2699         _available_formats = ['video', 'highqual', 'lowqual']
2700         _video_extensions = {
2701                 'video': 'mp4',
2702                 'highqual': 'mp4',
2703                 'lowqual': 'mp4',
2704         }
2705         IE_NAME = u'facebook'
2706
2707         def __init__(self, downloader=None):
2708                 InfoExtractor.__init__(self, downloader)
2709
2710         def _reporter(self, message):
2711                 """Add header and report message."""
2712                 self._downloader.to_screen(u'[facebook] %s' % message)
2713
2714         def report_login(self):
2715                 """Report attempt to log in."""
2716                 self._reporter(u'Logging in')
2717
2718         def report_video_webpage_download(self, video_id):
2719                 """Report attempt to download video webpage."""
2720                 self._reporter(u'%s: Downloading video webpage' % video_id)
2721
2722         def report_information_extraction(self, video_id):
2723                 """Report attempt to extract video information."""
2724                 self._reporter(u'%s: Extracting video information' % video_id)
2725
2726         def _parse_page(self, video_webpage):
2727                 """Extract video information from page"""
2728                 # General data
2729                 data = {'title': r'\("video_title", "(.*?)"\)',
2730                         'description': r'<div class="datawrap">(.*?)</div>',
2731                         'owner': r'\("video_owner_name", "(.*?)"\)',
2732                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2733                         }
2734                 video_info = {}
2735                 for piece in data.keys():
2736                         mobj = re.search(data[piece], video_webpage)
2737                         if mobj is not None:
2738                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2739
2740                 # Video urls
2741                 video_urls = {}
2742                 for fmt in self._available_formats:
2743                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2744                         if mobj is not None:
2745                                 # URL is in a Javascript segment inside an escaped Unicode format within
2746                                 # the generally utf-8 page
2747                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2748                 video_info['video_urls'] = video_urls
2749
2750                 return video_info
2751
2752         def _real_initialize(self):
2753                 if self._downloader is None:
2754                         return
2755
2756                 useremail = None
2757                 password = None
2758                 downloader_params = self._downloader.params
2759
2760                 # Attempt to use provided username and password or .netrc data
2761                 if downloader_params.get('username', None) is not None:
2762                         useremail = downloader_params['username']
2763                         password = downloader_params['password']
2764                 elif downloader_params.get('usenetrc', False):
2765                         try:
2766                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2767                                 if info is not None:
2768                                         useremail = info[0]
2769                                         password = info[2]
2770                                 else:
2771                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2772                         except (IOError, netrc.NetrcParseError), err:
2773                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2774                                 return
2775
2776                 if useremail is None:
2777                         return
2778
2779                 # Log in
2780                 login_form = {
2781                         'email': useremail,
2782                         'pass': password,
2783                         'login': 'Log+In'
2784                         }
2785                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2786                 try:
2787                         self.report_login()
2788                         login_results = urllib2.urlopen(request).read()
2789                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2790                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2791                                 return
2792                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2793                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2794                         return
2795
2796         def _real_extract(self, url):
2797                 mobj = re.match(self._VALID_URL, url)
2798                 if mobj is None:
2799                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2800                         return
2801                 video_id = mobj.group('ID')
2802
2803                 # Get video webpage
2804                 self.report_video_webpage_download(video_id)
2805                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2806                 try:
2807                         page = urllib2.urlopen(request)
2808                         video_webpage = page.read()
2809                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2810                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2811                         return
2812
2813                 # Start extracting information
2814                 self.report_information_extraction(video_id)
2815
2816                 # Extract information
2817                 video_info = self._parse_page(video_webpage)
2818
2819                 # uploader
2820                 if 'owner' not in video_info:
2821                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2822                         return
2823                 video_uploader = video_info['owner']
2824
2825                 # title
2826                 if 'title' not in video_info:
2827                         self._downloader.trouble(u'ERROR: unable to extract video title')
2828                         return
2829                 video_title = video_info['title']
2830                 video_title = video_title.decode('utf-8')
2831                 video_title = sanitize_title(video_title)
2832
2833                 # simplified title
2834                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2835                 simple_title = simple_title.strip(ur'_')
2836
2837                 # thumbnail image
2838                 if 'thumbnail' not in video_info:
2839                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2840                         video_thumbnail = ''
2841                 else:
2842                         video_thumbnail = video_info['thumbnail']
2843
2844                 # upload date
2845                 upload_date = u'NA'
2846                 if 'upload_date' in video_info:
2847                         upload_time = video_info['upload_date']
2848                         timetuple = email.utils.parsedate_tz(upload_time)
2849                         if timetuple is not None:
2850                                 try:
2851                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2852                                 except:
2853                                         pass
2854
2855                 # description
2856                 video_description = video_info.get('description', 'No description available.')
2857
2858                 url_map = video_info['video_urls']
2859                 if len(url_map.keys()) > 0:
2860                         # Decide which formats to download
2861                         req_format = self._downloader.params.get('format', None)
2862                         format_limit = self._downloader.params.get('format_limit', None)
2863
2864                         if format_limit is not None and format_limit in self._available_formats:
2865                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2866                         else:
2867                                 format_list = self._available_formats
2868                         existing_formats = [x for x in format_list if x in url_map]
2869                         if len(existing_formats) == 0:
2870                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2871                                 return
2872                         if req_format is None:
2873                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2874                         elif req_format == 'worst':
2875                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2876                         elif req_format == '-1':
2877                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2878                         else:
2879                                 # Specific format
2880                                 if req_format not in url_map:
2881                                         self._downloader.trouble(u'ERROR: requested format not available')
2882                                         return
2883                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2884
2885                 for format_param, video_real_url in video_url_list:
2886
2887                         # At this point we have a new video
2888                         self._downloader.increment_downloads()
2889
2890                         # Extension
2891                         video_extension = self._video_extensions.get(format_param, 'mp4')
2892
2893                         try:
2894                                 # Process video information
2895                                 self._downloader.process_info({
2896                                         'id':           video_id.decode('utf-8'),
2897                                         'url':          video_real_url.decode('utf-8'),
2898                                         'uploader':     video_uploader.decode('utf-8'),
2899                                         'upload_date':  upload_date,
2900                                         'title':        video_title,
2901                                         'stitle':       simple_title,
2902                                         'ext':          video_extension.decode('utf-8'),
2903                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2904                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2905                                         'description':  video_description.decode('utf-8'),
2906                                         'player_url':   None,
2907                                 })
2908                         except UnavailableVideoError, err:
2909                                 self._downloader.trouble(u'\nERROR: unable to download video')
2910
2911 class BlipTVIE(InfoExtractor):
2912         """Information extractor for blip.tv"""
2913
2914         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2915         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2916         IE_NAME = u'blip.tv'
2917
2918         def report_extraction(self, file_id):
2919                 """Report information extraction."""
2920                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2921
2922         def report_direct_download(self, title):
2923                 """Report information extraction."""
2924                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2925
2926         def _simplify_title(self, title):
2927                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2928                 res = res.strip(ur'_')
2929                 return res
2930
2931         def _real_extract(self, url):
2932                 mobj = re.match(self._VALID_URL, url)
2933                 if mobj is None:
2934                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2935                         return
2936
2937                 if '?' in url:
2938                         cchar = '&'
2939                 else:
2940                         cchar = '?'
2941                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2942                 request = urllib2.Request(json_url)
2943                 self.report_extraction(mobj.group(1))
2944                 info = None
2945                 try:
2946                         urlh = urllib2.urlopen(request)
2947                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2948                                 basename = url.split('/')[-1]
2949                                 title,ext = os.path.splitext(basename)
2950                                 ext = ext.replace('.', '')
2951                                 self.report_direct_download(title)
2952                                 info = {
2953                                         'id': title,
2954                                         'url': url,
2955                                         'title': title,
2956                                         'stitle': self._simplify_title(title),
2957                                         'ext': ext,
2958                                         'urlhandle': urlh
2959                                 }
2960                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2961                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2962                         return
2963                 if info is None: # Regular URL
2964                         try:
2965                                 json_code = urlh.read()
2966                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2967                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2968                                 return
2969
2970                         try:
2971                                 json_data = json.loads(json_code)
2972                                 if 'Post' in json_data:
2973                                         data = json_data['Post']
2974                                 else:
2975                                         data = json_data
2976         
2977                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2978                                 video_url = data['media']['url']
2979                                 umobj = re.match(self._URL_EXT, video_url)
2980                                 if umobj is None:
2981                                         raise ValueError('Can not determine filename extension')
2982                                 ext = umobj.group(1)
2983         
2984                                 info = {
2985                                         'id': data['item_id'],
2986                                         'url': video_url,
2987                                         'uploader': data['display_name'],
2988                                         'upload_date': upload_date,
2989                                         'title': data['title'],
2990                                         'stitle': self._simplify_title(data['title']),
2991                                         'ext': ext,
2992                                         'format': data['media']['mimeType'],
2993                                         'thumbnail': data['thumbnailUrl'],
2994                                         'description': data['description'],
2995                                         'player_url': data['embedUrl']
2996                                 }
2997                         except (ValueError,KeyError), err:
2998                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2999                                 return
3000
3001                 self._downloader.increment_downloads()
3002
3003                 try:
3004                         self._downloader.process_info(info)
3005                 except UnavailableVideoError, err:
3006                         self._downloader.trouble(u'\nERROR: unable to download video')
3007
3008
3009 class MyVideoIE(InfoExtractor):
3010         """Information Extractor for myvideo.de."""
3011
3012         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3013         IE_NAME = u'myvideo'
3014
3015         def __init__(self, downloader=None):
3016                 InfoExtractor.__init__(self, downloader)
3017         
3018         def report_download_webpage(self, video_id):
3019                 """Report webpage download."""
3020                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3021
3022         def report_extraction(self, video_id):
3023                 """Report information extraction."""
3024                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3025
3026         def _real_extract(self,url):
3027                 mobj = re.match(self._VALID_URL, url)
3028                 if mobj is None:
3029                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3030                         return
3031
3032                 video_id = mobj.group(1)
3033                 simple_title = mobj.group(2).decode('utf-8')
3034                 # should actually not be necessary
3035                 simple_title = sanitize_title(simple_title)
3036                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3037
3038                 # Get video webpage
3039                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3040                 try:
3041                         self.report_download_webpage(video_id)
3042                         webpage = urllib2.urlopen(request).read()
3043                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3044                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3045                         return
3046
3047                 self.report_extraction(video_id)
3048                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3049                                  webpage)
3050                 if mobj is None:
3051                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3052                         return
3053                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3054
3055                 mobj = re.search('<title>([^<]+)</title>', webpage)
3056                 if mobj is None:
3057                         self._downloader.trouble(u'ERROR: unable to extract title')
3058                         return
3059
3060                 video_title = mobj.group(1)
3061                 video_title = sanitize_title(video_title)
3062
3063                 try:
3064                         self._downloader.process_info({
3065                                 'id':           video_id,
3066                                 'url':          video_url,
3067                                 'uploader':     u'NA',
3068                                 'upload_date':  u'NA',
3069                                 'title':        video_title,
3070                                 'stitle':       simple_title,
3071                                 'ext':          u'flv',
3072                                 'format':       u'NA',
3073                                 'player_url':   None,
3074                         })
3075                 except UnavailableVideoError:
3076                         self._downloader.trouble(u'\nERROR: Unable to download video')
3077
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# A URL is either a ":shortname" alias (e.g. ":tds", ":colbert") or a
	# full-episodes page on thedailyshow.com / colbertnation.com. When the
	# alias form matches, only the 'shortname' group is set.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that information extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report that the per-item mediaGen configuration is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report that the episode's MRSS index feed is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report that the Flash player URL is being resolved."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		"""Replace runs of characters outside simple_title_chars with '_'."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		"""Download every video segment of a Daily Show / Colbert episode.

		Flow: resolve shortname aliases, fetch the episode page (following
		the site's redirect when no specific episode was given), pull the
		Flash <param> to get the player URL and the media URI, download the
		MRSS index listing the episode's segments, and for each segment
		fetch its mediaGen config to pick a rendition to download.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand ":tds"-style aliases to the show's full-episodes page and
		# re-match so the 'showname'/'episode' groups are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means "download the newest episode";
		# the site redirects the bare full-episodes URL to the latest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Learn the concrete episode from the redirected URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash embed carries both the full player URL (group 0) and
		# the mtvnservices URI identifying the episode's media (group 1).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One <item> per video segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# The mediaGen feed lists this item's available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				# (bitrate attribute, stream URL) pairs
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				# Skip this segment but keep processing the rest.
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3217
3218
3219 class EscapistIE(InfoExtractor):
3220         """Information extractor for The Escapist """
3221
3222         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3223         IE_NAME = u'escapist'
3224
3225         def report_extraction(self, showName):
3226                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3227
3228         def report_config_download(self, showName):
3229                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3230
3231         def _simplify_title(self, title):
3232                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3233                 res = res.strip(ur'_')
3234                 return res
3235
3236         def _real_extract(self, url):
3237                 htmlParser = HTMLParser.HTMLParser()
3238
3239                 mobj = re.match(self._VALID_URL, url)
3240                 if mobj is None:
3241                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3242                         return
3243                 showName = mobj.group('showname')
3244                 videoId = mobj.group('episode')
3245
3246                 self.report_extraction(showName)
3247                 try:
3248                         webPage = urllib2.urlopen(url).read()
3249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3250                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3251                         return
3252
3253                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3254                 description = htmlParser.unescape(descMatch.group(1))
3255                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3256                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3257                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3258                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3259                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3260                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3261
3262                 self.report_config_download(showName)
3263                 try:
3264                         configJSON = urllib2.urlopen(configUrl).read()
3265                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3267                         return
3268
3269                 # Technically, it's JavaScript, not JSON
3270                 configJSON = configJSON.replace("'", '"')
3271
3272                 try:
3273                         config = json.loads(configJSON)
3274                 except (ValueError,), err:
3275                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3276                         return
3277
3278                 playlist = config['playlist']
3279                 videoUrl = playlist[1]['url']
3280
3281                 self._downloader.increment_downloads()
3282                 info = {
3283                         'id': videoId,
3284                         'url': videoUrl,
3285                         'uploader': showName,
3286                         'upload_date': None,
3287                         'title': showName,
3288                         'stitle': self._simplify_title(showName),
3289                         'ext': 'flv',
3290                         'format': 'flv',
3291                         'thumbnail': imgUrl,
3292                         'description': description,
3293                         'player_url': playerUrl,
3294                 }
3295
3296                 try:
3297                         self._downloader.process_info(info)
3298                 except UnavailableVideoError, err:
3299                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3300
3301
3302 class CollegeHumorIE(InfoExtractor):
3303         """Information extractor for collegehumor.com"""
3304
3305         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3306         IE_NAME = u'collegehumor'
3307
3308         def report_webpage(self, video_id):
3309                 """Report information extraction."""
3310                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3311
3312         def report_extraction(self, video_id):
3313                 """Report information extraction."""
3314                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3315
3316         def _simplify_title(self, title):
3317                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3318                 res = res.strip(ur'_')
3319                 return res
3320
3321         def _real_extract(self, url):
3322                 htmlParser = HTMLParser.HTMLParser()
3323
3324                 mobj = re.match(self._VALID_URL, url)
3325                 if mobj is None:
3326                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3327                         return
3328                 video_id = mobj.group('videoid')
3329
3330                 self.report_webpage(video_id)
3331                 request = urllib2.Request(url)
3332                 try:
3333                         webpage = urllib2.urlopen(request).read()
3334                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3335                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3336                         return
3337
3338                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3339                 if m is None:
3340                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3341                         return
3342                 internal_video_id = m.group('internalvideoid')
3343
3344                 info = {
3345                         'id': video_id,
3346                         'internal_id': internal_video_id,
3347                 }
3348
3349                 self.report_extraction(video_id)
3350                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3351                 try:
3352                         metaXml = urllib2.urlopen(xmlUrl).read()
3353                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3354                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3355                         return
3356
3357                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3358                 try:
3359                         videoNode = mdoc.findall('./video')[0]
3360                         info['description'] = videoNode.findall('./description')[0].text
3361                         info['title'] = videoNode.findall('./caption')[0].text
3362                         info['stitle'] = self._simplify_title(info['title'])
3363                         info['url'] = videoNode.findall('./file')[0].text
3364                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3365                         info['ext'] = info['url'].rpartition('.')[2]
3366                         info['format'] = info['ext']
3367                 except IndexError:
3368                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3369                         return
3370
3371                 self._downloader.increment_downloads()
3372
3373                 try:
3374                         self._downloader.process_info(info)
3375                 except UnavailableVideoError, err:
3376                         self._downloader.trouble(u'\nERROR: unable to download video')
3377
3378
3379 class XVideosIE(InfoExtractor):
3380         """Information extractor for xvideos.com"""
3381
3382         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3383         IE_NAME = u'xvideos'
3384
3385         def report_webpage(self, video_id):
3386                 """Report information extraction."""
3387                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3388
3389         def report_extraction(self, video_id):
3390                 """Report information extraction."""
3391                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3392
3393         def _simplify_title(self, title):
3394                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3395                 res = res.strip(ur'_')
3396                 return res
3397
3398         def _real_extract(self, url):
3399                 htmlParser = HTMLParser.HTMLParser()
3400
3401                 mobj = re.match(self._VALID_URL, url)
3402                 if mobj is None:
3403                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3404                         return
3405                 video_id = mobj.group(1).decode('utf-8')
3406
3407                 self.report_webpage(video_id)
3408
3409                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3410                 try:
3411                         webpage = urllib2.urlopen(request).read()
3412                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3413                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3414                         return
3415
3416                 self.report_extraction(video_id)
3417
3418
3419                 # Extract video URL
3420                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3421                 if mobj is None:
3422                         self._downloader.trouble(u'ERROR: unable to extract video url')
3423                         return
3424                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3425
3426
3427                 # Extract title
3428                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3429                 if mobj is None:
3430                         self._downloader.trouble(u'ERROR: unable to extract video title')
3431                         return
3432                 video_title = mobj.group(1).decode('utf-8')
3433
3434
3435                 # Extract video thumbnail
3436                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3437                 if mobj is None:
3438                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3439                         return
3440                 video_thumbnail = mobj.group(1).decode('utf-8')
3441
3442
3443
3444                 self._downloader.increment_downloads()
3445                 info = {
3446                         'id': video_id,
3447                         'url': video_url,
3448                         'uploader': None,
3449                         'upload_date': None,
3450                         'title': video_title,
3451                         'stitle': self._simplify_title(video_title),
3452                         'ext': 'flv',
3453                         'format': 'flv',
3454                         'thumbnail': video_thumbnail,
3455                         'description': None,
3456                         'player_url': None,
3457                 }
3458
3459                 try:
3460                         self._downloader.process_info(info)
3461                 except UnavailableVideoError, err:
3462                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3463
3464
3465 class SoundcloudIE(InfoExtractor):
3466         """Information extractor for soundcloud.com
3467            To access the media, the uid of the song and a stream token
3468            must be extracted from the page source and the script must make
3469            a request to media.soundcloud.com/crossdomain.xml. Then
3470            the media can be grabbed by requesting from an url composed
3471            of the stream token and uid
3472          """
3473
3474         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3475         IE_NAME = u'soundcloud'
3476
3477         def __init__(self, downloader=None):
3478                 InfoExtractor.__init__(self, downloader)
3479
3480         def report_webpage(self, video_id):
3481                 """Report information extraction."""
3482                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3483
3484         def report_extraction(self, video_id):
3485                 """Report information extraction."""
3486                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3487
3488         def _real_extract(self, url):
3489                 htmlParser = HTMLParser.HTMLParser()
3490
3491                 mobj = re.match(self._VALID_URL, url)
3492                 if mobj is None:
3493                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3494                         return
3495
3496                 # extract uploader (which is in the url)
3497                 uploader = mobj.group(1).decode('utf-8')
3498                 # extract simple title (uploader + slug of song title)
3499                 slug_title =  mobj.group(2).decode('utf-8')
3500                 simple_title = uploader + '-' + slug_title
3501
3502                 self.report_webpage('%s/%s' % (uploader, slug_title))
3503
3504                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3505                 try:
3506                         webpage = urllib2.urlopen(request).read()
3507                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3508                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3509                         return
3510
3511                 self.report_extraction('%s/%s' % (uploader, slug_title))
3512
3513                 # extract uid and stream token that soundcloud hands out for access
3514                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3515                 if mobj:
3516                         video_id = mobj.group(1)
3517                         stream_token = mobj.group(2)
3518
3519                 # extract unsimplified title
3520                 mobj = re.search('"title":"(.*?)",', webpage)
3521                 if mobj:
3522                         title = mobj.group(1)
3523
3524                 # construct media url (with uid/token)
3525                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3526                 mediaURL = mediaURL % (video_id, stream_token)
3527
3528                 # description
3529                 description = u'No description available'
3530                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3531                 if mobj:
3532                         description = mobj.group(1)
3533                 
3534                 # upload date
3535                 upload_date = None
3536                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3537                 if mobj:
3538                         try:
3539                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3540                         except Exception as e:
3541                                 print str(e)
3542
3543                 # for soundcloud, a request to a cross domain is required for cookies
3544                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3545
3546                 try:
3547                         self._downloader.process_info({
3548                                 'id':           video_id.decode('utf-8'),
3549                                 'url':          mediaURL,
3550                                 'uploader':     uploader.decode('utf-8'),
3551                                 'upload_date':  upload_date,
3552                                 'title':        simple_title.decode('utf-8'),
3553                                 'stitle':       simple_title.decode('utf-8'),
3554                                 'ext':          u'mp3',
3555                                 'format':       u'NA',
3556                                 'player_url':   None,
3557                                 'description': description.decode('utf-8')
3558                         })
3559                 except UnavailableVideoError:
3560                         self._downloader.trouble(u'\nERROR: unable to download video')
3561
3562
3563 class InfoQIE(InfoExtractor):
3564         """Information extractor for infoq.com"""
3565
3566         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3567         IE_NAME = u'infoq'
3568
3569         def report_webpage(self, video_id):
3570                 """Report information extraction."""
3571                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3572
3573         def report_extraction(self, video_id):
3574                 """Report information extraction."""
3575                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3576
3577         def _simplify_title(self, title):
3578                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3579                 res = res.strip(ur'_')
3580                 return res
3581
3582         def _real_extract(self, url):
3583                 htmlParser = HTMLParser.HTMLParser()
3584
3585                 mobj = re.match(self._VALID_URL, url)
3586                 if mobj is None:
3587                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3588                         return
3589
3590                 self.report_webpage(url)
3591
3592                 request = urllib2.Request(url)
3593                 try:
3594                         webpage = urllib2.urlopen(request).read()
3595                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3596                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3597                         return
3598
3599                 self.report_extraction(url)
3600
3601
3602                 # Extract video URL
3603                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3604                 if mobj is None:
3605                         self._downloader.trouble(u'ERROR: unable to extract video url')
3606                         return
3607                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3608
3609
3610                 # Extract title
3611                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3612                 if mobj is None:
3613                         self._downloader.trouble(u'ERROR: unable to extract video title')
3614                         return
3615                 video_title = mobj.group(1).decode('utf-8')
3616
3617                 # Extract description
3618                 video_description = u'No description available.'
3619                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3620                 if mobj is not None:
3621                         video_description = mobj.group(1).decode('utf-8')
3622
3623                 video_filename = video_url.split('/')[-1]
3624                 video_id, extension = video_filename.split('.')
3625
3626                 self._downloader.increment_downloads()
3627                 info = {
3628                         'id': video_id,
3629                         'url': video_url,
3630                         'uploader': None,
3631                         'upload_date': None,
3632                         'title': video_title,
3633                         'stitle': self._simplify_title(video_title),
3634                         'ext': extension,
3635                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3636                         'thumbnail': None,
3637                         'description': video_description,
3638                         'player_url': None,
3639                 }
3640
3641                 try:
3642                         self._downloader.process_info(info)
3643                 except UnavailableVideoError, err:
3644                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3645
3646
3647
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader through its
	add_post_processor() method.  After each successful download the
	downloader walks its chain of PostProcessors, feeding the first one
	an initial info dictionary and each subsequent one the value
	returned by its predecessor.  A return value of None stops the
	chain early; otherwise it runs until the last processor.

	Like InfoExtractor objects, PostProcessors follow a "mutual
	registration" scheme with the downloader that owns them.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this PP to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, plus one extra key, "filepath", naming the
		downloaded file on disk.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly the one received, with some fields
		changed) to hand to the next PostProcessor in the chain.  This
		method may also raise PostProcessingError, which the calling
		downloader takes into account.
		"""
		return information # default implementation: pass through unchanged
3693
3694
class FFmpegExtractAudioPP(PostProcessor):
	"""Postprocessor that extracts the audio track of a downloaded video
	into a standalone audio file using the external ffmpeg/ffprobe tools."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec	# 'best', 'aac', 'mp3' or 'vorbis'
		self._preferredquality = preferredquality	# ffmpeg '-ab' bitrate spec, e.g. '128K'
		self._keepvideo = keepvideo	# if False, the source video is deleted afterwards

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in path, or None if
		ffprobe is unavailable, fails, or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# 'open' rather than the deprecated 'file' builtin; stderr discarded.
			handle = subprocess.Popen(cmd, stderr=open(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# Within each stream section ffprobe prints codec_name before
		# codec_type, so remember the last codec_name seen and report it
		# once an audio stream is confirmed.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Invoke ffmpeg to copy/transcode the audio of path into out_path.
		Return True on success, False on failure or if ffmpeg is missing."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=open(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible: copy the existing audio stream.
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy).
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to carry the original file's timestamp over to the audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except Exception:
				# Best effort only — but unlike a bare except, this no longer
				# swallows KeyboardInterrupt/SystemExit.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3794
3795
3796 def updateSelf(downloader, filename):
3797         ''' Update the program file with the latest version from the repository '''
3798         # Note: downloader only used for options
3799         if not os.access(filename, os.W_OK):
3800                 sys.exit('ERROR: no write permissions on %s' % filename)
3801
3802         downloader.to_screen('Updating to latest version...')
3803
3804         try:
3805                 try:
3806                         urlh = urllib.urlopen(UPDATE_URL)
3807                         newcontent = urlh.read()
3808                         
3809                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3810                         if vmatch is not None and vmatch.group(1) == __version__:
3811                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3812                                 return
3813                 finally:
3814                         urlh.close()
3815         except (IOError, OSError), err:
3816                 sys.exit('ERROR: unable to download latest version')
3817
3818         try:
3819                 outf = open(filename, 'wb')
3820                 try:
3821                         outf.write(newcontent)
3822                 finally:
3823                         outf.close()
3824         except (IOError, OSError), err:
3825                 sys.exit('ERROR: unable to overwrite current version')
3826
3827         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3828
def parseOpts():
	"""Build the optparse command-line parser, parse sys.argv and return
	the tuple (parser, opts, args)."""
	# Deferred imports
	import getpass	# NOTE(review): only imported here, but _real_main references getpass — confirm scope
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Honour an explicit COLUMNS variable, otherwise ask the terminal.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except Exception:
			# stty may be missing or its output unparseable; narrowed from a
			# bare except so Ctrl-C is not swallowed here.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	# Group order here determines the order in --help output.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
4015
def gen_extractors():
	"""Instantiate and return one of every supported information extractor.

	Order matters: a URL is handled by the first extractor in the list
	whose suitable() check matches it, so the generic fallback stays last.
	"""
	# Shared instances: the playlist/user/search/Metacafe extractors all
	# delegate to the same underlying site extractor.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),

		GenericIE()
	]
	return extractors
4049
4050 def _real_main():
4051         parser, opts, args = parseOpts()
4052
4053         # Open appropriate CookieJar
4054         if opts.cookiefile is None:
4055                 jar = cookielib.CookieJar()
4056         else:
4057                 try:
4058                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4059                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4060                                 jar.load()
4061                 except (IOError, OSError), err:
4062                         sys.exit(u'ERROR: unable to open cookie file')
4063
4064         # Dump user agent
4065         if opts.dump_user_agent:
4066                 print std_headers['User-Agent']
4067                 sys.exit(0)
4068
4069         # Batch file verification
4070         batchurls = []
4071         if opts.batchfile is not None:
4072                 try:
4073                         if opts.batchfile == '-':
4074                                 batchfd = sys.stdin
4075                         else:
4076                                 batchfd = open(opts.batchfile, 'r')
4077                         batchurls = batchfd.readlines()
4078                         batchurls = [x.strip() for x in batchurls]
4079                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4080                 except IOError:
4081                         sys.exit(u'ERROR: batch file could not be read')
4082         all_urls = batchurls + args
4083
4084         # General configuration
4085         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4086         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4087         urllib2.install_opener(opener)
4088         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4089
4090         extractors = gen_extractors()
4091
4092         if opts.list_extractors:
4093                 for ie in extractors:
4094                         print(ie.IE_NAME)
4095                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4096                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4097                         for mu in matchedUrls:
4098                                 print(u'  ' + mu)
4099                 sys.exit(0)
4100
4101         # Conflicting, missing and erroneous options
4102         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4103                 parser.error(u'using .netrc conflicts with giving username/password')
4104         if opts.password is not None and opts.username is None:
4105                 parser.error(u'account username missing')
4106         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4107                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4108         if opts.usetitle and opts.useliteral:
4109                 parser.error(u'using title conflicts with using literal title')
4110         if opts.username is not None and opts.password is None:
4111                 opts.password = getpass.getpass(u'Type account password and press return:')
4112         if opts.ratelimit is not None:
4113                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4114                 if numeric_limit is None:
4115                         parser.error(u'invalid rate limit specified')
4116                 opts.ratelimit = numeric_limit
4117         if opts.retries is not None:
4118                 try:
4119                         opts.retries = long(opts.retries)
4120                 except (TypeError, ValueError), err:
4121                         parser.error(u'invalid retry count specified')
4122         try:
4123                 opts.playliststart = int(opts.playliststart)
4124                 if opts.playliststart <= 0:
4125                         raise ValueError(u'Playlist start must be positive')
4126         except (TypeError, ValueError), err:
4127                 parser.error(u'invalid playlist start number specified')
4128         try:
4129                 opts.playlistend = int(opts.playlistend)
4130                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		# --extract-audio only supports this fixed codec list (plus 'best').
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
			parser.error(u'invalid audio format specified')

	# File downloader: map the parsed command-line options onto the
	# FileDownloader configuration dictionary.
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the --get-* options implies quiet mode: only the requested
		# metadata is printed, nothing else.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		# Simulation and all the --get-* options also skip the actual download.
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Output filename template: an explicitly supplied template (decoded
		# to unicode) wins; otherwise the first truthy and/or clause below
		# supplies a default depending on title/literal/autonumber options
		# and whether all formats ('-1') were requested.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# An output template of '-' means the video goes to stdout, so any
		# log/progress output must be routed to stderr instead.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		})
	# Register every available InfoExtractor with the downloader.
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors: optional audio extraction via ffmpeg.
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Self-update: rewrite this script in place when requested.
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing: running with no URLs is only valid together with
	# the self-update option; otherwise it is a usage error.
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()
	retcode = fd.download(all_urls)

	# Dump cookie jar if requested; failure to write it is fatal.
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	# The downloader's return code becomes the process exit status.
	sys.exit(retcode)
4209
def main():
	"""Command-line entry point.

	Runs _real_main() and translates the well-known failure modes into
	process exit statuses or error messages.  A normal SystemExit raised
	inside _real_main() (it exits via sys.exit itself) is not caught here
	and propagates unchanged.
	"""
	try:
		_real_main()
	except DownloadError:
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	else:
		return
	# sys.exit(int) sets the exit status directly; sys.exit(string) prints
	# the message to stderr and exits with status 1.
	sys.exit(status)
4219
# Run the command-line interface only when executed as a script,
# not when this file is imported as a module.
if __name__ == '__main__':
	main()
4222
4223 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: