2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
95 def raiseError(msg, i):
96 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
348 class MaxDownloadsReached(Exception):
349 """ --max-downloads limit has been reached. """
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
373 def __init__(self, downloaded, expected):
374 self.downloaded = downloaded
375 self.expected = expected
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
493 writesubtitles: Write the video subtitles to a .srt file
499 _download_retcode = None
500 _num_downloads = None
503 def __init__(self, params):
504 """Create a FileDownloader object with the given options."""
507 self._download_retcode = 0
508 self._num_downloads = 0
509 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
513 def format_bytes(bytes):
516 if type(bytes) is str:
521 exponent = long(math.log(bytes, 1024.0))
522 suffix = 'bkMGTPEZY'[exponent]
523 converted = float(bytes) / float(1024 ** exponent)
524 return '%.2f%s' % (converted, suffix)
527 def calc_percent(byte_counter, data_len):
530 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
533 def calc_eta(start, now, total, current):
537 if current == 0 or dif < 0.001: # One millisecond
539 rate = float(current) / dif
540 eta = long((float(total) - float(current)) / rate)
541 (eta_mins, eta_secs) = divmod(eta, 60)
544 return '%02d:%02d' % (eta_mins, eta_secs)
547 def calc_speed(start, now, bytes):
549 if bytes == 0 or dif < 0.001: # One millisecond
550 return '%10s' % '---b/s'
551 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
554 def best_block_size(elapsed_time, bytes):
555 new_min = max(bytes / 2.0, 1.0)
556 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
557 if elapsed_time < 0.001:
559 rate = bytes / elapsed_time
567 def parse_bytes(bytestr):
568 """Parse a string indicating a byte quantity into a long integer."""
569 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
572 number = float(matchobj.group(1))
573 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
574 return long(round(number * multiplier))
576 def add_info_extractor(self, ie):
577 """Add an InfoExtractor object to the end of the list."""
579 ie.set_downloader(self)
581 def add_post_processor(self, pp):
582 """Add a PostProcessor object to the end of the chain."""
584 pp.set_downloader(self)
586 def to_screen(self, message, skip_eol=False):
587 """Print message to stdout if not in quiet mode."""
588 assert type(message) == type(u'')
589 if not self.params.get('quiet', False):
590 terminator = [u'\n', u''][skip_eol]
591 output = message + terminator
593 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
594 output = output.encode(preferredencoding(), 'ignore')
595 self._screen_file.write(output)
596 self._screen_file.flush()
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode with the locale's preferred encoding before writing, same
    # as to_screen() does for stdout.
    encoded = message.encode(preferredencoding())
    sys.stderr.write(encoded + '\n')
602 def to_cons_title(self, message):
603 """Set console/terminal window title to message."""
604 if not self.params.get('consoletitle', False):
606 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
607 # c_wchar_p() might not be necessary if `message` is
608 # already of type unicode()
609 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
610 elif 'TERM' in os.environ:
611 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
613 def fixed_template(self):
614 """Checks if the output template is fixed."""
615 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
617 def trouble(self, message=None):
618 """Determine action to take when a download problem appears.
620 Depending on if the downloader has been configured to ignore
621 download errors or not, this method may throw an exception or
622 not when errors are found, after printing the message.
624 if message is not None:
625 self.to_stderr(message)
626 if not self.params.get('ignoreerrors', False):
627 raise DownloadError(message)
628 self._download_retcode = 1
630 def slow_down(self, start_time, byte_counter):
631 """Sleep if the download speed is over the rate limit."""
632 rate_limit = self.params.get('ratelimit', None)
633 if rate_limit is None or byte_counter == 0:
636 elapsed = now - start_time
639 speed = float(byte_counter) / elapsed
640 if speed > rate_limit:
641 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
643 def temp_name(self, filename):
644 """Returns a temporary filename for the given filename."""
645 if self.params.get('nopart', False) or filename == u'-' or \
646 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
648 return filename + u'.part'
650 def undo_temp_name(self, filename):
651 if filename.endswith(u'.part'):
652 return filename[:-len(u'.part')]
655 def try_rename(self, old_filename, new_filename):
657 if old_filename == new_filename:
659 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
660 except (IOError, OSError), err:
661 self.trouble(u'ERROR: unable to rename file')
663 def try_utime(self, filename, last_modified_hdr):
664 """Try to set the last-modified time of the given file."""
665 if last_modified_hdr is None:
667 if not os.path.isfile(_encodeFilename(filename)):
669 timestr = last_modified_hdr
672 filetime = timeconvert(timestr)
676 os.utime(filename, (time.time(), filetime))
681 def report_writedescription(self, descfn):
682 """ Report that the description file is being written """
683 self.to_screen(u'[info] Writing video description to: ' + descfn)
685 def report_writesubtitles(self, srtfn):
686 """ Report that the subtitles file is being written """
687 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
689 def report_writeinfojson(self, infofn):
690 """ Report that the metadata file has been written """
691 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
693 def report_destination(self, filename):
694 """Report destination filename."""
695 self.to_screen(u'[download] Destination: ' + filename)
697 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
698 """Report download progress."""
699 if self.params.get('noprogress', False):
701 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
702 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
703 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
704 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
706 def report_resuming_byte(self, resume_len):
707 """Report attempt to resume at given byte."""
708 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
710 def report_retry(self, count, retries):
711 """Report retry in case of HTTP error 5xx"""
712 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
714 def report_file_already_downloaded(self, file_name):
715 """Report file has already been fully downloaded."""
717 self.to_screen(u'[download] %s has already been downloaded' % file_name)
718 except (UnicodeEncodeError), err:
719 self.to_screen(u'[download] The file has already been downloaded')
721 def report_unable_to_resume(self):
722 """Report it was impossible to resume download."""
723 self.to_screen(u'[download] Unable to resume')
725 def report_finish(self):
726 """Report download finished."""
727 if self.params.get('noprogress', False):
728 self.to_screen(u'[download] Download completed')
732 def increment_downloads(self):
733 """Increment the ordinal that assigns a number to each file."""
734 self._num_downloads += 1
736 def prepare_filename(self, info_dict):
737 """Generate the output filename."""
739 template_dict = dict(info_dict)
740 template_dict['epoch'] = unicode(long(time.time()))
741 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
742 filename = self.params['outtmpl'] % template_dict
744 except (ValueError, KeyError), err:
745 self.trouble(u'ERROR: invalid system charset or erroneous output template')
748 def _match_entry(self, info_dict):
749 """ Returns None iff the file should be downloaded """
751 title = info_dict['title']
752 matchtitle = self.params.get('matchtitle', False)
753 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
754 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
755 rejecttitle = self.params.get('rejecttitle', False)
756 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
757 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
760 def process_info(self, info_dict):
761 """Process a single dictionary returned by an InfoExtractor."""
763 reason = self._match_entry(info_dict)
764 if reason is not None:
765 self.to_screen(u'[download] ' + reason)
768 max_downloads = self.params.get('max_downloads')
769 if max_downloads is not None:
770 if self._num_downloads > int(max_downloads):
771 raise MaxDownloadsReached()
773 filename = self.prepare_filename(info_dict)
776 if self.params.get('forcetitle', False):
777 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
778 if self.params.get('forceurl', False):
779 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
781 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcedescription', False) and 'description' in info_dict:
783 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcefilename', False) and filename is not None:
785 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forceformat', False):
787 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
789 # Do nothing else if in simulate mode
790 if self.params.get('simulate', False):
797 dn = os.path.dirname(_encodeFilename(filename))
798 if dn != '' and not os.path.exists(dn): # dn is already encoded
800 except (OSError, IOError), err:
801 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
804 if self.params.get('writedescription', False):
806 descfn = filename + u'.description'
807 self.report_writedescription(descfn)
808 descfile = open(_encodeFilename(descfn), 'wb')
810 descfile.write(info_dict['description'].encode('utf-8'))
813 except (OSError, IOError):
814 self.trouble(u'ERROR: Cannot write description file ' + descfn)
817 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
818 # subtitles download errors are already managed as troubles in relevant IE
819 # that way it will silently go on when used with unsupporting IE
821 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
822 self.report_writesubtitles(srtfn)
823 srtfile = open(_encodeFilename(srtfn), 'wb')
825 srtfile.write(info_dict['subtitles'].encode('utf-8'))
828 except (OSError, IOError):
829 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
832 if self.params.get('writeinfojson', False):
833 infofn = filename + u'.info.json'
834 self.report_writeinfojson(infofn)
837 except (NameError,AttributeError):
838 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
841 infof = open(_encodeFilename(infofn), 'wb')
843 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
844 json.dump(json_info_dict, infof)
847 except (OSError, IOError):
848 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
851 if not self.params.get('skip_download', False):
852 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
856 success = self._do_download(filename, info_dict)
857 except (OSError, IOError), err:
858 raise UnavailableVideoError
859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
862 except (ContentTooShortError, ), err:
863 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
868 self.post_process(filename, info_dict)
869 except (PostProcessingError), err:
870 self.trouble(u'ERROR: postprocessing: %s' % str(err))
873 def download(self, url_list):
874 """Download a given list of URLs."""
875 if len(url_list) > 1 and self.fixed_template():
876 raise SameFileError(self.params['outtmpl'])
879 suitable_found = False
881 # Go to next InfoExtractor if not suitable
882 if not ie.suitable(url):
885 # Suitable InfoExtractor found
886 suitable_found = True
888 # Extract information from URL and process it
891 # Suitable InfoExtractor had been found; go to next URL
894 if not suitable_found:
895 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
897 return self._download_retcode
899 def post_process(self, filename, ie_info):
900 """Run the postprocessing chain on the given file."""
902 info['filepath'] = filename
908 def _download_with_rtmpdump(self, filename, url, player_url):
909 self.report_destination(filename)
910 tmpfilename = self.temp_name(filename)
912 # Check for rtmpdump first
914 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
915 except (OSError, IOError):
916 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
919 # Download using rtmpdump. rtmpdump returns exit code 2 when
920 # the connection was interrumpted and resuming appears to be
921 # possible. This is part of rtmpdump's normal usage, AFAIK.
922 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
923 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
924 if self.params.get('verbose', False):
927 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
930 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
931 retval = subprocess.call(args)
932 while retval == 2 or retval == 1:
933 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
934 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
935 time.sleep(5.0) # This seems to be needed
936 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
937 cursize = os.path.getsize(_encodeFilename(tmpfilename))
938 if prevsize == cursize and retval == 1:
940 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
941 if prevsize == cursize and retval == 2 and cursize > 1024:
942 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
946 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
947 self.try_rename(tmpfilename, filename)
950 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
953 def _do_download(self, filename, info_dict):
954 url = info_dict['url']
955 player_url = info_dict.get('player_url', None)
957 # Check file already present
958 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
959 self.report_file_already_downloaded(filename)
962 # Attempt to download using rtmpdump
963 if url.startswith('rtmp'):
964 return self._download_with_rtmpdump(filename, url, player_url)
966 tmpfilename = self.temp_name(filename)
969 # Do not include the Accept-Encoding header
970 headers = {'Youtubedl-no-compression': 'True'}
971 basic_request = urllib2.Request(url, None, headers)
972 request = urllib2.Request(url, None, headers)
974 # Establish possible resume length
975 if os.path.isfile(_encodeFilename(tmpfilename)):
976 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
982 if self.params.get('continuedl', False):
983 self.report_resuming_byte(resume_len)
984 request.add_header('Range','bytes=%d-' % resume_len)
990 retries = self.params.get('retries', 0)
991 while count <= retries:
992 # Establish connection
994 if count == 0 and 'urlhandle' in info_dict:
995 data = info_dict['urlhandle']
996 data = urllib2.urlopen(request)
998 except (urllib2.HTTPError, ), err:
999 if (err.code < 500 or err.code >= 600) and err.code != 416:
1000 # Unexpected HTTP error
1002 elif err.code == 416:
1003 # Unable to resume (requested range not satisfiable)
1005 # Open the connection again without the range header
1006 data = urllib2.urlopen(basic_request)
1007 content_length = data.info()['Content-Length']
1008 except (urllib2.HTTPError, ), err:
1009 if err.code < 500 or err.code >= 600:
1012 # Examine the reported length
1013 if (content_length is not None and
1014 (resume_len - 100 < long(content_length) < resume_len + 100)):
1015 # The file had already been fully downloaded.
1016 # Explanation to the above condition: in issue #175 it was revealed that
1017 # YouTube sometimes adds or removes a few bytes from the end of the file,
1018 # changing the file size slightly and causing problems for some users. So
1019 # I decided to implement a suggested change and consider the file
1020 # completely downloaded if the file size differs less than 100 bytes from
1021 # the one in the hard drive.
1022 self.report_file_already_downloaded(filename)
1023 self.try_rename(tmpfilename, filename)
1026 # The length does not match, we start the download over
1027 self.report_unable_to_resume()
1032 if count <= retries:
1033 self.report_retry(count, retries)
1036 self.trouble(u'ERROR: giving up after %s retries' % retries)
1039 data_len = data.info().get('Content-length', None)
1040 if data_len is not None:
1041 data_len = long(data_len) + resume_len
1042 data_len_str = self.format_bytes(data_len)
1043 byte_counter = 0 + resume_len
1047 # Download and write
1048 before = time.time()
1049 data_block = data.read(block_size)
1051 if len(data_block) == 0:
1053 byte_counter += len(data_block)
1055 # Open file just in time
1058 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1059 assert stream is not None
1060 filename = self.undo_temp_name(tmpfilename)
1061 self.report_destination(filename)
1062 except (OSError, IOError), err:
1063 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1066 stream.write(data_block)
1067 except (IOError, OSError), err:
1068 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070 block_size = self.best_block_size(after - before, len(data_block))
1073 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1074 if data_len is None:
1075 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077 percent_str = self.calc_percent(byte_counter, data_len)
1078 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1079 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1082 self.slow_down(start, byte_counter - resume_len)
1085 self.trouble(u'\nERROR: Did not get any data blocks')
1088 self.report_finish()
1089 if data_len is not None and byte_counter != data_len:
1090 raise ContentTooShortError(byte_counter, long(data_len))
1091 self.try_rename(tmpfilename, filename)
1093 # Update file modification time
1094 if self.params.get('updatetime', True):
1095 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # _ready guards initialize() so that authentication/setup in
        # _real_initialize() runs at most once per instance.
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1170 class YoutubeIE(InfoExtractor):
# NOTE(review): this block is an elided numbered listing — original line numbers
# are embedded in each line and several source lines (e.g. 'if mobj is None:'
# guards, 'return' statements, try/except openers) are missing between entries.
1171 """Information extractor for youtube.com."""
# _VALID_URL matches watch pages, youtu.be short links, embed/e/v paths, and
# nocookie domains; capture group 2 is the video id.
1173 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1174 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1175 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1176 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1177 _NETRC_MACHINE = 'youtube'
1178 # Listed in order of quality
1179 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1180 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> filename extension map (interior entries elided in this listing).
1181 _video_extensions = {
1187 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> human-readable dimension map (entries elided in this listing).
1192 _video_dimensions = {
1207 IE_NAME = u'youtube'
# --- status-reporting helpers: write progress lines via the FileDownloader ---
1209 def report_lang(self):
1210 """Report attempt to set language."""
1211 self._downloader.to_screen(u'[youtube] Setting language')
1213 def report_login(self):
1214 """Report attempt to log in."""
1215 self._downloader.to_screen(u'[youtube] Logging in')
1217 def report_age_confirmation(self):
1218 """Report attempt to confirm age."""
1219 self._downloader.to_screen(u'[youtube] Confirming age')
1221 def report_video_webpage_download(self, video_id):
1222 """Report attempt to download video webpage."""
1223 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1225 def report_video_info_webpage_download(self, video_id):
1226 """Report attempt to download video info webpage."""
1227 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1229 def report_video_subtitles_download(self, video_id):
1230 """Report attempt to download video subtitles."""
1231 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1233 def report_information_extraction(self, video_id):
1234 """Report attempt to extract video information."""
1235 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1237 def report_unavailable_format(self, video_id, format):
1238 """Report extracted video URL."""
1239 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1241 def report_rtmp_download(self):
1242 """Indicate the download will use the RTMP protocol."""
1243 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Convert YouTube's closed-caption XML into SubRip (SRT) text.
# NOTE(review): the 'srt' accumulator and the final 'return' are not in the
# visible lines (presumably on elided lines 1246/1260 — confirm). Also note
# cue numbers here start at 0 via str(n); SRT players conventionally expect 1.
1245 def _closed_captions_xml_to_srt(self, xml_string):
1247 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1248 # TODO parse xml instead of regex
1249 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions with no explicit duration default to 4 seconds on screen.
1250 if not dur: dur = '4'
1251 start = float(start)
1252 end = start + float(dur)
# Format both timestamps as HH:MM:SS,mmm as required by SRT.
1253 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1254 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1255 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1256 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
1257 srt += str(n) + '\n'
1258 srt += start + ' --> ' + end + '\n'
1259 srt += caption + '\n\n'
# Print the itag/extension/dimensions table for --list-formats.
1262 def _print_formats(self, formats):
1263 print 'Available formats:'
1265 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# One-time setup: force the English UI, then optionally log in (explicit
# credentials or .netrc) and confirm age. Runs via InfoExtractor.initialize().
1267 def _real_initialize(self):
1268 if self._downloader is None:
1273 downloader_params = self._downloader.params
1275 # Attempt to use provided username and password or .netrc data
1276 if downloader_params.get('username', None) is not None:
1277 username = downloader_params['username']
1278 password = downloader_params['password']
1279 elif downloader_params.get('usenetrc', False):
1281 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1282 if info is not None:
1286 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1287 except (IOError, netrc.NetrcParseError), err:
1288 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: fetch _LANG_URL so later pages come back in English.
1292 request = urllib2.Request(self._LANG_URL)
1295 urllib2.urlopen(request).read()
1296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1297 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1300 # No authentication to be performed
1301 if username is None:
# Log in by POSTing the signup-page form fields.
1306 'current_form': 'loginForm',
1308 'action_login': 'Log In',
1309 'username': username,
1310 'password': password,
1312 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1315 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1316 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1317 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age to unlock age-restricted videos.
1326 'action_confirm': 'Confirm',
1328 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1330 self.report_age_confirmation()
1331 age_results = urllib2.urlopen(request).read()
1332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1333 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: id -> watch page -> get_video_info -> metadata -> format
# selection -> process_info() per selected format.
1336 def _real_extract(self, url):
1337 # Extract video id from URL
1338 mobj = re.match(self._VALID_URL, url)
1340 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1342 video_id = mobj.group(2)
1345 self.report_video_webpage_download(video_id)
1346 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1348 video_webpage = urllib2.urlopen(request).read()
1349 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1353 # Attempt to extract SWF player URL
1354 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1355 if mobj is not None:
# Unescape the JS-escaped URL (\\/ -> /).
1356 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a token.
1361 self.report_video_info_webpage_download(video_id)
1362 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1363 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1364 % (video_id, el_type))
1365 request = urllib2.Request(video_info_url)
1367 video_info_webpage = urllib2.urlopen(request).read()
1368 video_info = parse_qs(video_info_webpage)
1369 if 'token' in video_info:
1371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1372 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1374 if 'token' not in video_info:
1375 if 'reason' in video_info:
1376 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1378 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1381 # Start extracting information
1382 self.report_information_extraction(video_id)
1385 if 'author' not in video_info:
1386 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1388 video_uploader = urllib.unquote_plus(video_info['author'][0])
1391 if 'title' not in video_info:
1392 self._downloader.trouble(u'ERROR: unable to extract video title')
1394 video_title = urllib.unquote_plus(video_info['title'][0])
1395 video_title = video_title.decode('utf-8')
1396 video_title = sanitize_title(video_title)
1399 simple_title = _simplify_title(video_title)
1402 if 'thumbnail_url' not in video_info:
1403 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1404 video_thumbnail = ''
1405 else: # don't panic if we can't find it
1406 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scrape the watch page and try several textual date formats.
1410 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1411 if mobj is not None:
1412 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1413 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1414 for expression in format_expressions:
1416 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1424 video_description = u'No description available.'
1425 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1426 if mobj is not None:
1427 video_description = mobj.group(1).decode('utf-8')
# Preferred path: full description via lxml (only when lxml is available).
1429 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1430 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1431 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1432 # TODO use another parser
# Closed captions: list available languages, prefer English, convert to SRT.
1435 video_subtitles = None
1436 if self._downloader.params.get('writesubtitles', False):
1437 self.report_video_subtitles_download(video_id)
1438 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1440 srt_list = urllib2.urlopen(request).read()
1441 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1444 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1446 if 'en' in srt_lang_list: srt_lang = 'en'
1447 else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
1448 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1450 srt_xml = urllib2.urlopen(request).read()
1451 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1452 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1454 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1456 self._downloader.trouble(u'WARNING: video has no subtitles')
1459 video_token = urllib.unquote_plus(video_info['token'][0])
1461 # Decide which formats to download
1462 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn' and bypass itag format selection.
1464 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1465 self.report_rtmp_download()
1466 video_url_list = [(None, video_info['conn'][0])]
1467 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Build itag -> URL map from the comma-separated, urlencoded stream map.
1468 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1469 url_data = [parse_qs(uds) for uds in url_data_strs]
1470 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1471 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1473 format_limit = self._downloader.params.get('format_limit', None)
1474 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1475 if format_limit is not None and format_limit in available_formats:
1476 format_list = available_formats[available_formats.index(format_limit):]
1478 format_list = available_formats
1479 existing_formats = [x for x in format_list if x in url_map]
1480 if len(existing_formats) == 0:
1481 self._downloader.trouble(u'ERROR: no known formats available for video')
1483 if self._downloader.params.get('listformats', None):
1484 self._print_formats(existing_formats)
1486 if req_format is None or req_format == 'best':
1487 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1488 elif req_format == 'worst':
1489 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1490 elif req_format in ('-1', 'all'):
1491 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1493 # Specific formats. We pick the first in a slash-delimeted sequence.
1494 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1495 req_formats = req_format.split('/')
1496 video_url_list = None
1497 for rf in req_formats:
1499 video_url_list = [(rf, url_map[rf])]
1501 if video_url_list is None:
1502 self._downloader.trouble(u'ERROR: requested format not available')
1505 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Hand each selected (format, URL) pair to the downloader.
1508 for format_param, video_real_url in video_url_list:
1509 # At this point we have a new video
1510 self._downloader.increment_downloads()
1513 video_extension = self._video_extensions.get(format_param, 'flv')
1516 # Process video information
1517 self._downloader.process_info({
1518 'id': video_id.decode('utf-8'),
1519 'url': video_real_url.decode('utf-8'),
1520 'uploader': video_uploader.decode('utf-8'),
1521 'upload_date': upload_date,
1522 'title': video_title,
1523 'stitle': simple_title,
1524 'ext': video_extension.decode('utf-8'),
1525 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1526 'thumbnail': video_thumbnail.decode('utf-8'),
1527 'description': video_description,
1528 'player_url': player_url,
1529 'subtitles': video_subtitles
1531 except UnavailableVideoError, err:
1532 self._downloader.trouble(u'\nERROR: unable to download video')
1535 class MetacafeIE(InfoExtractor):
# NOTE(review): elided numbered listing — original line numbers are embedded and
# some lines (guards, 'return's, try/except openers) are missing between entries.
1536 """Information Extractor for metacafe.com."""
1538 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1539 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1540 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1542 IE_NAME = u'metacafe'
# Receives a YoutubeIE instance so Metacafe pages that wrap YouTube videos
# (ids prefixed 'yt-') can be delegated to it.
1544 def __init__(self, youtube_ie, downloader=None):
1545 InfoExtractor.__init__(self, downloader)
1546 self._youtube_ie = youtube_ie
1548 def report_disclaimer(self):
1549 """Report disclaimer retrieval."""
1550 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1552 def report_age_confirmation(self):
1553 """Report attempt to confirm age."""
1554 self._downloader.to_screen(u'[metacafe] Confirming age')
1556 def report_download_webpage(self, video_id):
1557 """Report webpage download."""
1558 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1560 def report_extraction(self, video_id):
1561 """Report information extraction."""
1562 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the disclaimer page, then POST the family-filter form
# to disable filtering for this session.
1564 def _real_initialize(self):
1565 # Retrieve disclaimer
1566 request = urllib2.Request(self._DISCLAIMER)
1568 self.report_disclaimer()
1569 disclaimer = urllib2.urlopen(request).read()
1570 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1571 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age (form fields partially elided in this listing).
1577 'submit': "Continue - I'm over 18",
1579 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1581 self.report_age_confirmation()
1582 disclaimer = urllib2.urlopen(request).read()
1583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1584 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1587 def _real_extract(self, url):
1588 # Extract id and simplified title from URL
1589 mobj = re.match(self._VALID_URL, url)
1591 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1594 video_id = mobj.group(1)
1596 # Check if video comes from YouTube
1597 mobj2 = re.match(r'^yt-(.*)$', video_id)
1598 if mobj2 is not None:
# Delegate 'yt-' ids to the YouTube extractor and stop here.
1599 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1602 # At this point we have a new video
1603 self._downloader.increment_downloads()
1605 simple_title = mobj.group(2).decode('utf-8')
1607 # Retrieve video webpage to extract further information
1608 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1610 self.report_download_webpage(video_id)
1611 webpage = urllib2.urlopen(request).read()
1612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1616 # Extract URL, uploader and title from webpage
1617 self.report_extraction(video_id)
# Primary path: direct mediaURL parameter, optionally signed with gdaKey.
1618 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1619 if mobj is not None:
1620 mediaURL = urllib.unquote(mobj.group(1))
1621 video_extension = mediaURL[-3:]
1623 # Extract gdaKey if available
1624 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1626 video_url = mediaURL
1628 gdaKey = mobj.group(1)
1629 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaData/mediaURL + key.
1631 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1633 self._downloader.trouble(u'ERROR: unable to extract media URL')
1635 vardict = parse_qs(mobj.group(1))
1636 if 'mediaData' not in vardict:
1637 self._downloader.trouble(u'ERROR: unable to extract media URL')
1639 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1641 self._downloader.trouble(u'ERROR: unable to extract media URL')
1643 mediaURL = mobj.group(1).replace('\\/', '/')
1644 video_extension = mediaURL[-3:]
1645 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1647 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1649 self._downloader.trouble(u'ERROR: unable to extract title')
1651 video_title = mobj.group(1).decode('utf-8')
1652 video_title = sanitize_title(video_title)
1654 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1656 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1658 video_uploader = mobj.group(1)
1661 # Process video information
1662 self._downloader.process_info({
1663 'id': video_id.decode('utf-8'),
1664 'url': video_url.decode('utf-8'),
1665 'uploader': video_uploader.decode('utf-8'),
1666 'upload_date': u'NA',
1667 'title': video_title,
1668 'stitle': simple_title,
1669 'ext': video_extension.decode('utf-8'),
1673 except UnavailableVideoError:
1674 self._downloader.trouble(u'\nERROR: unable to download video')
1677 class DailymotionIE(InfoExtractor):
# NOTE(review): elided numbered listing — original line numbers are embedded and
# some lines (guards, 'return's, try/except openers) are missing between entries.
1678 """Information Extractor for Dailymotion"""
1680 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1681 IE_NAME = u'dailymotion'
1683 def __init__(self, downloader=None):
1684 InfoExtractor.__init__(self, downloader)
1686 def report_download_webpage(self, video_id):
1687 """Report webpage download."""
1688 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1690 def report_extraction(self, video_id):
1691 """Report information extraction."""
1692 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1694 def _real_extract(self, url):
1695 # Extract id and simplified title from URL
1696 mobj = re.match(self._VALID_URL, url)
1698 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1701 # At this point we have a new video
1702 self._downloader.increment_downloads()
1703 video_id = mobj.group(1)
# Dailymotion serves flv through this path.
1705 video_extension = 'flv'
1707 # Retrieve video webpage to extract further information
1708 request = urllib2.Request(url)
# Disable the family filter so restricted videos are reachable.
1709 request.add_header('Cookie', 'family_filter=off')
1711 self.report_download_webpage(video_id)
1712 webpage = urllib2.urlopen(request).read()
1713 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1714 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1717 # Extract URL, uploader and title from webpage
1718 self.report_extraction(video_id)
# The player's 'sequence' flashvar holds the urlencoded stream description.
1719 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1721 self._downloader.trouble(u'ERROR: unable to extract media URL')
1723 sequence = urllib.unquote(mobj.group(1))
# Pick the standard-definition URL out of the sequence blob.
1724 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1726 self._downloader.trouble(u'ERROR: unable to extract media URL')
1728 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1730 # if needed add http://www.dailymotion.com/ if relative URL
1732 video_url = mediaURL
1734 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1736 self._downloader.trouble(u'ERROR: unable to extract title')
1738 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1739 video_title = sanitize_title(video_title)
1740 simple_title = _simplify_title(video_title)
1742 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1744 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1746 video_uploader = mobj.group(1)
1749 # Process video information
1750 self._downloader.process_info({
1751 'id': video_id.decode('utf-8'),
1752 'url': video_url.decode('utf-8'),
1753 'uploader': video_uploader.decode('utf-8'),
1754 'upload_date': u'NA',
1755 'title': video_title,
1756 'stitle': simple_title,
1757 'ext': video_extension.decode('utf-8'),
1761 except UnavailableVideoError:
1762 self._downloader.trouble(u'\nERROR: unable to download video')
1765 class GoogleIE(InfoExtractor):
# NOTE(review): elided numbered listing — original line numbers are embedded and
# some lines (guards, 'return's, try/except openers) are missing between entries.
1766 """Information extractor for video.google.com."""
1768 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1769 IE_NAME = u'video.google'
1771 def __init__(self, downloader=None):
1772 InfoExtractor.__init__(self, downloader)
1774 def report_download_webpage(self, video_id):
1775 """Report webpage download."""
1776 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1778 def report_extraction(self, video_id):
1779 """Report information extraction."""
1780 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1782 def _real_extract(self, url):
1783 # Extract id from URL
1784 mobj = re.match(self._VALID_URL, url)
1786 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1789 # At this point we have a new video
1790 self._downloader.increment_downloads()
1791 video_id = mobj.group(1)
# Default to mp4; drops to flv below when only the flash URL is present.
1793 video_extension = 'mp4'
1795 # Retrieve video webpage to extract further information
1796 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1798 self.report_download_webpage(video_id)
1799 webpage = urllib2.urlopen(request).read()
1800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1804 # Extract URL, uploader, and title from webpage
1805 self.report_extraction(video_id)
# Prefer the direct download_url; otherwise fall back to the \x-escaped
# flash videoUrl and treat the file as flv.
1806 mobj = re.search(r"download_url:'([^']+)'", webpage)
1808 video_extension = 'flv'
1809 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1811 self._downloader.trouble(u'ERROR: unable to extract media URL')
1813 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JS hex escaping: \x3d is '=' and \x26 is '&'.
1814 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1815 mediaURL = mediaURL.replace('\\x26', '\x26')
1817 video_url = mediaURL
1819 mobj = re.search(r'<title>(.*)</title>', webpage)
1821 self._downloader.trouble(u'ERROR: unable to extract title')
1823 video_title = mobj.group(1).decode('utf-8')
1824 video_title = sanitize_title(video_title)
1825 simple_title = _simplify_title(video_title)
1827 # Extract video description
1828 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1830 self._downloader.trouble(u'ERROR: unable to extract video description')
1832 video_description = mobj.group(1).decode('utf-8')
1834 video_description = 'No description available.'
1836 # Extract video thumbnail
# Thumbnail requires a second search-page request, so only do it on demand.
1837 if self._downloader.params.get('forcethumbnail', False):
1838 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1840 webpage = urllib2.urlopen(request).read()
1841 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1844 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1846 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1848 video_thumbnail = mobj.group(1)
1849 else: # we need something to pass to process_info
1850 video_thumbnail = ''
1853 # Process video information
1854 self._downloader.process_info({
1855 'id': video_id.decode('utf-8'),
1856 'url': video_url.decode('utf-8'),
1858 'upload_date': u'NA',
1859 'title': video_title,
1860 'stitle': simple_title,
1861 'ext': video_extension.decode('utf-8'),
1865 except UnavailableVideoError:
1866 self._downloader.trouble(u'\nERROR: unable to download video')
1869 class PhotobucketIE(InfoExtractor):
1870 """Information extractor for photobucket.com."""
1872 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1873 IE_NAME = u'photobucket'
1875 def __init__(self, downloader=None):
1876 InfoExtractor.__init__(self, downloader)
1878 def report_download_webpage(self, video_id):
1879 """Report webpage download."""
1880 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1882 def report_extraction(self, video_id):
1883 """Report information extraction."""
1884 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1886 def _real_extract(self, url):
1887 # Extract id from URL
1888 mobj = re.match(self._VALID_URL, url)
1890 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1893 # At this point we have a new video
1894 self._downloader.increment_downloads()
1895 video_id = mobj.group(1)
1897 video_extension = 'flv'
1899 # Retrieve video webpage to extract further information
1900 request = urllib2.Request(url)
1902 self.report_download_webpage(video_id)
1903 webpage = urllib2.urlopen(request).read()
1904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1908 # Extract URL, uploader, and title from webpage
1909 self.report_extraction(video_id)
1910 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1912 self._downloader.trouble(u'ERROR: unable to extract media URL')
1914 mediaURL = urllib.unquote(mobj.group(1))
1916 video_url = mediaURL
1918 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1920 self._downloader.trouble(u'ERROR: unable to extract title')
1922 video_title = mobj.group(1).decode('utf-8')
1923 video_title = sanitize_title(video_title)
1924 simple_title = _simplify_title(vide_title)
1926 video_uploader = mobj.group(2).decode('utf-8')
1929 # Process video information
1930 self._downloader.process_info({
1931 'id': video_id.decode('utf-8'),
1932 'url': video_url.decode('utf-8'),
1933 'uploader': video_uploader,
1934 'upload_date': u'NA',
1935 'title': video_title,
1936 'stitle': simple_title,
1937 'ext': video_extension.decode('utf-8'),
1941 except UnavailableVideoError:
1942 self._downloader.trouble(u'\nERROR: unable to download video')
1945 class YahooIE(InfoExtractor):
1946 """Information extractor for video.yahoo.com."""
1948 # _VALID_URL matches all Yahoo! Video URLs
1949 # _VPAGE_URL matches only the extractable '/watch/' URLs
1950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1952 IE_NAME = u'video.yahoo'
1954 def __init__(self, downloader=None):
1955 InfoExtractor.__init__(self, downloader)
1957 def report_download_webpage(self, video_id):
1958 """Report webpage download."""
1959 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1961 def report_extraction(self, video_id):
1962 """Report information extraction."""
1963 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1965 def _real_extract(self, url, new_video=True):
1966 # Extract ID from URL
1967 mobj = re.match(self._VALID_URL, url)
1969 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1972 # At this point we have a new video
1973 self._downloader.increment_downloads()
1974 video_id = mobj.group(2)
1975 video_extension = 'flv'
1977 # Rewrite valid but non-extractable URLs as
1978 # extractable English language /watch/ URLs
1979 if re.match(self._VPAGE_URL, url) is None:
1980 request = urllib2.Request(url)
1982 webpage = urllib2.urlopen(request).read()
1983 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1984 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1987 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1989 self._downloader.trouble(u'ERROR: Unable to extract id field')
1991 yahoo_id = mobj.group(1)
1993 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1995 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1997 yahoo_vid = mobj.group(1)
1999 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2000 return self._real_extract(url, new_video=False)
2002 # Retrieve video webpage to extract further information
2003 request = urllib2.Request(url)
2005 self.report_download_webpage(video_id)
2006 webpage = urllib2.urlopen(request).read()
2007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2011 # Extract uploader and title from webpage
2012 self.report_extraction(video_id)
2013 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2015 self._downloader.trouble(u'ERROR: unable to extract video title')
2017 video_title = mobj.group(1).decode('utf-8')
2018 simple_title = _simplify_title(video_title)
2020 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2022 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2024 video_uploader = mobj.group(1).decode('utf-8')
2026 # Extract video thumbnail
2027 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2029 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2031 video_thumbnail = mobj.group(1).decode('utf-8')
2033 # Extract video description
2034 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2036 self._downloader.trouble(u'ERROR: unable to extract video description')
2038 video_description = mobj.group(1).decode('utf-8')
2039 if not video_description:
2040 video_description = 'No description available.'
2042 # Extract video height and width
2043 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2045 self._downloader.trouble(u'ERROR: unable to extract video height')
2047 yv_video_height = mobj.group(1)
2049 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2051 self._downloader.trouble(u'ERROR: unable to extract video width')
2053 yv_video_width = mobj.group(1)
2055 # Retrieve video playlist to extract media URL
2056 # I'm not completely sure what all these options are, but we
2057 # seem to need most of them, otherwise the server sends a 401.
2058 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2059 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2060 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2061 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2062 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2064 self.report_download_webpage(video_id)
2065 webpage = urllib2.urlopen(request).read()
2066 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2070 # Extract media URL from playlist XML
2071 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2073 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2075 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2076 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2079 # Process video information
2080 self._downloader.process_info({
2081 'id': video_id.decode('utf-8'),
2083 'uploader': video_uploader,
2084 'upload_date': u'NA',
2085 'title': video_title,
2086 'stitle': simple_title,
2087 'ext': video_extension.decode('utf-8'),
2088 'thumbnail': video_thumbnail.decode('utf-8'),
2089 'description': video_description,
2090 'thumbnail': video_thumbnail,
2093 except UnavailableVideoError:
2094 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this is a numbered, partially elided listing -- gaps in the
# embedded line numbers (e.g. 2118, 2120-2121, 2133-2134, 2191-2195) hide
# statements such as `try:` / `if mobj is None:` / `return`. Comments below
# describe only the visible code. Python 2 idioms throughout (urllib2,
# u'' literals, `except X, err`).
2097 class VimeoIE(InfoExtractor):
2098 """Information extractor for vimeo.com."""
2100 # _VALID_URL matches Vimeo URLs
2101 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2104 def __init__(self, downloader=None):
2105 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers: write status lines through the downloader.
2107 def report_download_webpage(self, video_id):
2108 """Report webpage download."""
2109 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2111 def report_extraction(self, video_id):
2112 """Report information extraction."""
2113 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main extraction entry point. `new_video` is accepted but is not used in
# any visible line -- presumably kept for interface compatibility; confirm
# against callers.
2115 def _real_extract(self, url, new_video=True):
2116 # Extract ID from URL
2117 mobj = re.match(self._VALID_URL, url)
2119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2122 # At this point we have a new video
2123 self._downloader.increment_downloads()
2124 video_id = mobj.group(1)
2126 # Retrieve video webpage to extract further information
2127 request = urllib2.Request(url, None, std_headers)
2129 self.report_download_webpage(video_id)
2130 webpage = urllib2.urlopen(request).read()
2131 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2132 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2135 # Now we begin extracting as much information as we can from what we
2136 # retrieved. First we extract the information common to all extractors,
2137 # and latter we extract those that are Vimeo specific.
2138 self.report_extraction(video_id)
2140 # Extract the config JSON
# The player config is sliced out of the raw page text between the literal
# markers ' = {config:' and ',assets:' rather than parsed structurally, then
# fed to json.loads. IndexError/ValueError handling is in the elided lines.
2141 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2143 config = json.loads(config)
2145 self._downloader.trouble(u'ERROR: unable to extract info section')
2149 video_title = config["video"]["title"]
2150 simple_title = _simplify_title(video_title)
2153 video_uploader = config["video"]["owner"]["name"]
2155 # Extract video thumbnail
2156 video_thumbnail = config["video"]["thumbnail"]
2158 # Extract video description
# Default first, then try the <meta name="description"> tag; the visible
# lxml branch re-extracts it from the element with id="description".
2162 video_description = u'No description available.'
2163 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2164 if mobj is not None:
2165 video_description = mobj.group(1)
2167 html_parser = lxml.etree.HTMLParser()
2168 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2169 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2170 # TODO use another parser
2172 # Extract upload date
2173 video_upload_date = u'NA'
2174 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2175 if mobj is not None:
2176 video_upload_date = mobj.group(1)
2178 # Vimeo specific: extract request signature and timestamp
2179 sig = config['request']['signature']
2180 timestamp = config['request']['timestamp']
2182 # Vimeo specific: extract video codec and quality information
2183 # TODO bind to format param
# Preference order: h264/mp4, then vp8/flv, then vp6/flv. The first codec
# present in config["video"]["files"] wins; 'hd' is preferred over 'sd'.
2184 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2185 for codec in codecs:
2186 if codec[0] in config["video"]["files"]:
2187 video_codec = codec[0]
2188 video_extension = codec[1]
2189 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2190 else: quality = 'sd'
2193 self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the id, signature, timestamp, quality
# and upper-cased codec name.
2196 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2197 %(video_id, sig, timestamp, quality, video_codec.upper())
2200 # Process video information
2201 self._downloader.process_info({
2204 'uploader': video_uploader,
2205 'upload_date': video_upload_date,
2206 'title': video_title,
2207 'stitle': simple_title,
2208 'ext': video_extension,
2209 'thumbnail': video_thumbnail,
2210 'description': video_description,
2213 except UnavailableVideoError:
2214 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2241, 2246, 2256, 2259) hide `try:` / `if mobj is None:` /
# `return` statements. Comments describe only the visible code.
2217 class GenericIE(InfoExtractor):
2218 """Generic last-resort information extractor."""
2221 IE_NAME = u'generic'
2223 def __init__(self, downloader=None):
2224 InfoExtractor.__init__(self, downloader)
2226 def report_download_webpage(self, video_id):
2227 """Report webpage download."""
# Warn explicitly: reaching this extractor means no specific IE matched.
2228 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2229 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2231 def report_extraction(self, video_id):
2232 """Report information extraction."""
2233 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2235 def _real_extract(self, url):
2236 # At this point we have a new video
2237 self._downloader.increment_downloads()
# Provisional id: last path component of the URL (replaced later once the
# real media URL is known).
2239 video_id = url.split('/')[-1]
2240 request = urllib2.Request(url)
2242 self.report_download_webpage(video_id)
2243 webpage = urllib2.urlopen(request).read()
2244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2247 except ValueError, err:
2248 # since this is the last-resort InfoExtractor, if
2249 # this error is thrown, it'll be thrown here
2250 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2253 self.report_extraction(video_id)
2254 # Start with something easy: JW Player in SWFObject
2255 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2257 # Broaden the search a little bit
# Fallback: any file=/source= parameter pointing at an http URL.
2258 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2260 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2263 # It's possible that one of the regexes
2264 # matched, but returned an empty group:
2265 if mobj.group(1) is None:
2266 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2269 video_url = urllib.unquote(mobj.group(1))
2270 video_id = os.path.basename(video_url)
2272 # here's a fun little line of code for you:
# Split the basename into (id, extension) -- splitext is called twice, once
# for each half.
2273 video_extension = os.path.splitext(video_id)[1][1:]
2274 video_id = os.path.splitext(video_id)[0]
2276 # it's tempting to parse this further, but you would
2277 # have to take into account all the variations like
2278 # Video Title - Site Name
2279 # Site Name | Video Title
2280 # Video Title - Tagline | Site Name
2281 # and so on and so forth; it's just not practical
2282 mobj = re.search(r'<title>(.*)</title>', webpage)
2284 self._downloader.trouble(u'ERROR: unable to extract title')
2286 video_title = mobj.group(1).decode('utf-8')
2287 video_title = sanitize_title(video_title)
2288 simple_title = _simplify_title(video_title)
2290 # video uploader is domain name
2291 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error text says 'title' but this failure is about the
# uploader/domain -- looks like a copy-paste message; confirm upstream.
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2295 video_uploader = mobj.group(1).decode('utf-8')
2298 # Process video information
2299 self._downloader.process_info({
2300 'id': video_id.decode('utf-8'),
2301 'url': video_url.decode('utf-8'),
2302 'uploader': video_uploader,
2303 'upload_date': u'NA',
2304 'title': video_title,
2305 'stitle': simple_title,
2306 'ext': video_extension.decode('utf-8'),
2310 except UnavailableVideoError, err:
2311 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2336, 2341, 2348-2352, 2375, 2386-2387) hide `try:` /
# `if mobj is None:` / `return` and loop-setup statements. Comments describe
# only the visible code.
2314 class YoutubeSearchIE(InfoExtractor):
2315 """Information Extractor for YouTube search queries."""
# Query syntax: 'ytsearch:term', 'ytsearchN:term' or 'ytsearchall:term'.
2316 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData API, JSON-C format, 50 results per page.
2317 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2319 _max_youtube_results = 1000
2320 IE_NAME = u'youtube:search'
# Delegates the actual per-video extraction to a YoutubeIE instance.
2322 def __init__(self, youtube_ie, downloader=None):
2323 InfoExtractor.__init__(self, downloader)
2324 self._youtube_ie = youtube_ie
2326 def report_download_page(self, query, pagenum):
2327 """Report attempt to download playlist page with given number."""
2328 query = query.decode(preferredencoding())
2329 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2331 def _real_initialize(self):
2332 self._youtube_ie.initialize()
# Parse the prefix (count / 'all' / empty) and dispatch to
# _download_n_results with the requested result count.
2334 def _real_extract(self, query):
2335 mobj = re.match(self._VALID_URL, query)
2337 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2340 prefix, query = query.split(':')
2342 query = query.encode('utf-8')
2344 self._download_n_results(query, 1)
2346 elif prefix == 'all':
2347 self._download_n_results(query, self._max_youtube_results)
2353 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2355 elif n > self._max_youtube_results:
2356 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2357 n = self._max_youtube_results
2358 self._download_n_results(query, n)
2360 except ValueError: # parsing prefix as integer fails
2361 self._download_n_results(query, 1)
2364 def _download_n_results(self, query, n):
2365 """Downloads a specified number of results for a query"""
# Page through the API 50 ids at a time until `limit` is reached; `limit`
# is tightened each page to min(n, totalItems) reported by the API.
2371 while (50 * pagenum) < limit:
2372 self.report_download_page(query, pagenum+1)
2373 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2374 request = urllib2.Request(result_url)
2376 data = urllib2.urlopen(request).read()
2377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2380 api_response = json.loads(data)['data']
2382 new_ids = list(video['id'] for video in api_response['items'])
2383 video_ids += new_ids
2385 limit = min(n, api_response['totalItems'])
2388 if len(video_ids) > n:
2389 video_ids = video_ids[:n]
# Hand each collected id to the YouTube extractor.
2390 for id in video_ids:
2391 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2419, 2424, 2431-2435, 2457, 2472-2473, 2477-2478) hide
# `try:` / `if mobj is None:` / `return` statements. Structure parallels
# YoutubeSearchIE but scrapes HTML pages instead of an API.
2395 class GoogleSearchIE(InfoExtractor):
2396 """Information Extractor for Google Video search queries."""
# Query syntax: 'gvsearch:term', 'gvsearchN:term' or 'gvsearchall:term'.
2397 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2398 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2399 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next" pager link means more result pages exist.
2400 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2402 _max_google_results = 1000
2403 IE_NAME = u'video.google:search'
# Delegates per-video extraction to a GoogleIE instance.
2405 def __init__(self, google_ie, downloader=None):
2406 InfoExtractor.__init__(self, downloader)
2407 self._google_ie = google_ie
2409 def report_download_page(self, query, pagenum):
2410 """Report attempt to download playlist page with given number."""
2411 query = query.decode(preferredencoding())
2412 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2414 def _real_initialize(self):
2415 self._google_ie.initialize()
# Parse the prefix (count / 'all' / empty) and dispatch to
# _download_n_results with the requested result count.
2417 def _real_extract(self, query):
2418 mobj = re.match(self._VALID_URL, query)
2420 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2423 prefix, query = query.split(':')
2425 query = query.encode('utf-8')
2427 self._download_n_results(query, 1)
2429 elif prefix == 'all':
2430 self._download_n_results(query, self._max_google_results)
2436 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2438 elif n > self._max_google_results:
2439 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2440 n = self._max_google_results
2441 self._download_n_results(query, n)
2443 except ValueError: # parsing prefix as integer fails
2444 self._download_n_results(query, 1)
2447 def _download_n_results(self, query, n):
2448 """Downloads a specified number of results for a query"""
# Scrape result pages (10 results per page via start=pagenum*10), collect
# unique docids, and stop early once n ids are gathered or no next page.
2454 self.report_download_page(query, pagenum)
2455 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2456 request = urllib2.Request(result_url)
2458 page = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2463 # Extract video identifiers
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 video_id = mobj.group(1)
2466 if video_id not in video_ids:
2467 video_ids.append(video_id)
2468 if len(video_ids) == n:
2469 # Specified n videos reached
2470 for id in video_ids:
2471 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "next" link: extract everything collected so far and finish.
2474 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2475 for id in video_ids:
2476 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2479 pagenum = pagenum + 1
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2506, 2511, 2518-2522, 2545, 2561-2562, 2566-2567) hide
# `try:` / `if mobj is None:` / `return` statements. Structure parallels
# GoogleSearchIE, with an extra `already_seen` set for de-duplication.
2482 class YahooSearchIE(InfoExtractor):
2483 """Information Extractor for Yahoo! Video search queries."""
# Query syntax: 'yvsearch:term', 'yvsearchN:term' or 'yvsearchall:term'.
2484 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2485 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2486 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2487 _MORE_PAGES_INDICATOR = r'\s*Next'
2489 _max_yahoo_results = 1000
2490 IE_NAME = u'video.yahoo:search'
# Delegates per-video extraction to a YahooIE instance.
2492 def __init__(self, yahoo_ie, downloader=None):
2493 InfoExtractor.__init__(self, downloader)
2494 self._yahoo_ie = yahoo_ie
2496 def report_download_page(self, query, pagenum):
2497 """Report attempt to download playlist page with given number."""
2498 query = query.decode(preferredencoding())
2499 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2501 def _real_initialize(self):
2502 self._yahoo_ie.initialize()
# Parse the prefix (count / 'all' / empty) and dispatch to
# _download_n_results with the requested result count.
2504 def _real_extract(self, query):
2505 mobj = re.match(self._VALID_URL, query)
2507 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2510 prefix, query = query.split(':')
2512 query = query.encode('utf-8')
2514 self._download_n_results(query, 1)
2516 elif prefix == 'all':
2517 self._download_n_results(query, self._max_yahoo_results)
2523 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2525 elif n > self._max_yahoo_results:
2526 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2527 n = self._max_yahoo_results
2528 self._download_n_results(query, n)
2530 except ValueError: # parsing prefix as integer fails
2531 self._download_n_results(query, 1)
2534 def _download_n_results(self, query, n):
2535 """Downloads a specified number of results for a query"""
2538 already_seen = set()
2542 self.report_download_page(query, pagenum)
2543 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2544 request = urllib2.Request(result_url)
2546 page = urllib2.urlopen(request).read()
2547 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2548 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2551 # Extract video identifiers
# Ids look like 'NNN/NNN'; duplicates are filtered via already_seen.
2552 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2553 video_id = mobj.group(1)
2554 if video_id not in already_seen:
2555 video_ids.append(video_id)
2556 already_seen.add(video_id)
2557 if len(video_ids) == n:
2558 # Specified n videos reached
2559 for id in video_ids:
2560 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: extract everything collected so far and finish.
2563 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2564 for id in video_ids:
2565 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2568 pagenum = pagenum + 1
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2595, 2597-2599, 2602-2603, 2609, 2613-2616, 2620, 2624-2625,
# 2627, 2632, 2634, 2641, 2643) hide `try:` / `if` / `else:` / `return` /
# loop-setup statements. Comments describe only the visible code.
2571 class YoutubePlaylistIE(InfoExtractor):
2572 """Information Extractor for YouTube playlists."""
# Matches playlist/course/artist/user-page URLs; group(1) is the list-type
# key ('p', 'a' or 'list'), group(2) the playlist id, group(3) an optional
# single-video id embedded in the URL.
2574 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2575 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2576 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2577 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2579 IE_NAME = u'youtube:playlist'
# Delegates per-video extraction to a YoutubeIE instance.
2581 def __init__(self, youtube_ie, downloader=None):
2582 InfoExtractor.__init__(self, downloader)
2583 self._youtube_ie = youtube_ie
2585 def report_download_page(self, playlist_id, pagenum):
2586 """Report attempt to download playlist page with given number."""
2587 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2589 def _real_initialize(self):
2590 self._youtube_ie.initialize()
2592 def _real_extract(self, url):
2593 # Extract playlist id
2594 mobj = re.match(self._VALID_URL, url)
2596 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# If the URL also names a single video, extract just that video.
2600 if mobj.group(3) is not None:
2601 self._youtube_ie.extract(mobj.group(3))
2604 # Download playlist pages
2605 # prefix is 'p' as default for playlists but there are other types that need extra care
# 'a' prefix means an artist page; everything else (visible branch) falls
# back to the 'view_play_list' endpoint with prefix 'p'.
2606 playlist_prefix = mobj.group(1)
2607 if playlist_prefix == 'a':
2608 playlist_access = 'artist'
2610 playlist_prefix = 'p'
2611 playlist_access = 'view_play_list'
2612 playlist_id = mobj.group(2)
# Page loop: fetch, scrape watch ids for this playlist, stop when the
# "Next" pager link disappears.
2617 self.report_download_page(playlist_id, pagenum)
2618 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2619 request = urllib2.Request(url)
2621 page = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2626 # Extract video identifiers
2628 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2629 if mobj.group(1) not in ids_in_page:
2630 ids_in_page.append(mobj.group(1))
2631 video_ids.extend(ids_in_page)
2633 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2635 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based start,
# -1 end meaning "to the end") before extracting.
2637 playliststart = self._downloader.params.get('playliststart', 1) - 1
2638 playlistend = self._downloader.params.get('playlistend', -1)
2639 if playlistend == -1:
2640 video_ids = video_ids[playliststart:]
2642 video_ids = video_ids[playliststart:playlistend]
2644 for id in video_ids:
2645 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2675, 2677-2678, 2684-2689, 2694-2695, 2699-2703, 2707,
# 2714-2720, 2724, 2727, 2729) hide `try:` / `if` / loop-setup statements.
# Comments describe only the visible code.
2649 class YoutubeUserIE(InfoExtractor):
2650 """Information Extractor for YouTube users."""
# Accepts both full user-page URLs and the 'ytuser:NAME' shorthand.
2652 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2653 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2654 _GDATA_PAGE_SIZE = 50
2655 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2656 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2658 IE_NAME = u'youtube:user'
# Delegates per-video extraction to a YoutubeIE instance.
2660 def __init__(self, youtube_ie, downloader=None):
2661 InfoExtractor.__init__(self, downloader)
2662 self._youtube_ie = youtube_ie
2664 def report_download_page(self, username, start_index):
2665 """Report attempt to download user page."""
2666 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2667 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2669 def _real_initialize(self):
2670 self._youtube_ie.initialize()
2672 def _real_extract(self, url):
2674 mobj = re.match(self._VALID_URL, url)
2676 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2679 username = mobj.group(1)
2681 # Download video ids using YouTube Data API. Result size per
2682 # query is limited (currently to 50 videos) so we need to query
2683 # page by page until there are no video ids - it means we got
# GData feed is 1-indexed: page p starts at p*PAGE_SIZE + 1.
2690 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2691 self.report_download_page(username, start_index)
2693 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2696 page = urllib2.urlopen(request).read()
2697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2698 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2701 # Extract video identifiers
2704 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2705 if mobj.group(1) not in ids_in_page:
2706 ids_in_page.append(mobj.group(1))
2708 video_ids.extend(ids_in_page)
2710 # A little optimization - if current page is not
2711 # "full", ie. does not contain PAGE_SIZE video ids then
2712 # we can assume that this page is the last one - there
2713 # are no more ids on further pages - no need to query
2716 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based start,
# -1 end meaning "to the end") before extracting.
2721 all_ids_count = len(video_ids)
2722 playliststart = self._downloader.params.get('playliststart', 1) - 1
2723 playlistend = self._downloader.params.get('playlistend', -1)
2725 if playlistend == -1:
2726 video_ids = video_ids[playliststart:]
2728 video_ids = video_ids[playliststart:playlistend]
2730 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2731 (username, all_ids_count, len(video_ids)))
2733 for video_id in video_ids:
2734 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2757, 2765, 2770-2771, 2780, 2782-2783, 2789, 2791,
# 2793-2794, 2799, 2804-2806) hide `try:` / `if mobj is None:` / `return`
# statements and parts of the process_info dict. Comments describe only the
# visible code.
2737 class DepositFilesIE(InfoExtractor):
2738 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment: the '../' segment is an optional two-letter
# locale path component.
2740 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2741 IE_NAME = u'DepositFiles'
2743 def __init__(self, downloader=None):
2744 InfoExtractor.__init__(self, downloader)
2746 def report_download_webpage(self, file_id):
2747 """Report webpage download."""
2748 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2750 def report_extraction(self, file_id):
2751 """Report information extraction."""
2752 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2754 def _real_extract(self, url):
2755 # At this point we have a new file
2756 self._downloader.increment_downloads()
2758 file_id = url.split('/')[-1]
2759 # Rebuild url in english locale
# Force the English locale so the regexes below match the expected markup.
2760 url = 'http://depositfiles.com/en/files/' + file_id
2762 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download' button.
2763 free_download_indication = { 'gateway_result' : '1' }
2764 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2766 self.report_download_webpage(file_id)
2767 webpage = urllib2.urlopen(request).read()
2768 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2769 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2772 # Search for the real file URL
2773 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2774 if (mobj is None) or (mobj.group(1) is None):
2775 # Try to figure out reason of the error.
# On failure, look for an '<strong>Attention...</strong>' notice and report
# it (whitespace collapsed) as the error reason.
2776 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2777 if (mobj is not None) and (mobj.group(1) is not None):
2778 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2779 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2781 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2784 file_url = mobj.group(1)
2785 file_extension = os.path.splitext(file_url)[1][1:]
2787 # Search for file title
2788 mobj = re.search(r'<b title="(.*?)">', webpage)
2790 self._downloader.trouble(u'ERROR: unable to extract title')
2792 file_title = mobj.group(1).decode('utf-8')
2795 # Process file information
2796 self._downloader.process_info({
2797 'id': file_id.decode('utf-8'),
2798 'url': file_url.decode('utf-8'),
2800 'upload_date': u'NA',
2801 'title': file_title,
2802 'stitle': file_title,
2803 'ext': file_extension.decode('utf-8'),
2807 except UnavailableVideoError, err:
2808 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): numbered, partially elided listing -- gaps in the embedded
# numbering (e.g. 2819-2822, the dict body at 2818, 2846, 2857-2859,
# 2867-2869, 2872-2875, 2883, 2886-2888, 2892-2902, 2904-2905, 2909,
# 2912-2913, 2916, 2918, 2920-2921, 2924, 2929-2930, etc.) hide `try:` /
# `if` / `return` statements, the _video_extensions mapping, the login form
# construction and several assignments. Comments describe only the visible
# code.
2811 class FacebookIE(InfoExtractor):
2812 """Information Extractor for Facebook"""
2814 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2815 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2816 _NETRC_MACHINE = 'facebook'
# Format names in descending-quality order (best first).
2817 _available_formats = ['video', 'highqual', 'lowqual']
# Mapping format name -> file extension; entries are in the elided lines.
2818 _video_extensions = {
2823 IE_NAME = u'facebook'
2825 def __init__(self, downloader=None):
2826 InfoExtractor.__init__(self, downloader)
2828 def _reporter(self, message):
2829 """Add header and report message."""
2830 self._downloader.to_screen(u'[facebook] %s' % message)
2832 def report_login(self):
2833 """Report attempt to log in."""
2834 self._reporter(u'Logging in')
2836 def report_video_webpage_download(self, video_id):
2837 """Report attempt to download video webpage."""
2838 self._reporter(u'%s: Downloading video webpage' % video_id)
2840 def report_information_extraction(self, video_id):
2841 """Report attempt to extract video information."""
2842 self._reporter(u'%s: Extracting video information' % video_id)
2844 def _parse_page(self, video_webpage):
2845 """Extract video information from page"""
# Metadata is scraped from JavaScript snippets in the page; each dict value
# is the regex whose group(1) yields the (percent- and unicode-escaped)
# field value.
2847 data = {'title': r'\("video_title", "(.*?)"\)',
2848 'description': r'<div class="datawrap">(.*?)</div>',
2849 'owner': r'\("video_owner_name", "(.*?)"\)',
2850 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2853 for piece in data.keys():
2854 mobj = re.search(data[piece], video_webpage)
2855 if mobj is not None:
2856 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per available format from the "<fmt>_src" snippets.
2860 for fmt in self._available_formats:
2861 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2862 if mobj is not None:
2863 # URL is in a Javascript segment inside an escaped Unicode format within
2864 # the generally utf-8 page
2865 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2866 video_info['video_urls'] = video_urls
# Log in (credentials from --username/--password or .netrc) before any
# extraction; failures only warn, they do not abort.
2870 def _real_initialize(self):
2871 if self._downloader is None:
2876 downloader_params = self._downloader.params
2878 # Attempt to use provided username and password or .netrc data
2879 if downloader_params.get('username', None) is not None:
2880 useremail = downloader_params['username']
2881 password = downloader_params['password']
2882 elif downloader_params.get('usenetrc', False):
2884 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2885 if info is not None:
2889 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2890 except (IOError, netrc.NetrcParseError), err:
2891 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2894 if useremail is None:
# Login form construction is in the elided lines (2895-2902).
2903 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2906 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2907 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2908 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2910 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2911 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2914 def _real_extract(self, url):
2915 mobj = re.match(self._VALID_URL, url)
2917 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2919 video_id = mobj.group('ID')
2922 self.report_video_webpage_download(video_id)
2923 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2925 page = urllib2.urlopen(request)
2926 video_webpage = page.read()
2927 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2928 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2931 # Start extracting information
2932 self.report_information_extraction(video_id)
2934 # Extract information
2935 video_info = self._parse_page(video_webpage)
2938 if 'owner' not in video_info:
2939 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2941 video_uploader = video_info['owner']
2944 if 'title' not in video_info:
2945 self._downloader.trouble(u'ERROR: unable to extract video title')
2947 video_title = video_info['title']
2948 video_title = video_title.decode('utf-8')
2949 video_title = sanitize_title(video_title)
2951 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
2954 if 'thumbnail' not in video_info:
2955 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2956 video_thumbnail = ''
2958 video_thumbnail = video_info['thumbnail']
# Upload date: parse the page's date string via email.utils.parsedate_tz
# and reformat as YYYYMMDD.
2962 if 'upload_date' in video_info:
2963 upload_time = video_info['upload_date']
2964 timetuple = email.utils.parsedate_tz(upload_time)
2965 if timetuple is not None:
2967 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2972 video_description = video_info.get('description', 'No description available.')
# Format selection mirrors the usual IE logic: clamp the candidate list by
# --format-limit, then pick best / 'worst' / all ('-1') / a specific format.
2974 url_map = video_info['video_urls']
2975 if len(url_map.keys()) > 0:
2976 # Decide which formats to download
2977 req_format = self._downloader.params.get('format', None)
2978 format_limit = self._downloader.params.get('format_limit', None)
2980 if format_limit is not None and format_limit in self._available_formats:
2981 format_list = self._available_formats[self._available_formats.index(format_limit):]
2983 format_list = self._available_formats
2984 existing_formats = [x for x in format_list if x in url_map]
2985 if len(existing_formats) == 0:
2986 self._downloader.trouble(u'ERROR: no known formats available for video')
2988 if req_format is None:
2989 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2990 elif req_format == 'worst':
2991 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2992 elif req_format == '-1':
2993 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2996 if req_format not in url_map:
2997 self._downloader.trouble(u'ERROR: requested format not available')
2999 video_url_list = [(req_format, url_map[req_format])] # Specific format
3001 for format_param, video_real_url in video_url_list:
3003 # At this point we have a new video
3004 self._downloader.increment_downloads()
3007 video_extension = self._video_extensions.get(format_param, 'mp4')
3010 # Process video information
3011 self._downloader.process_info({
3012 'id': video_id.decode('utf-8'),
3013 'url': video_real_url.decode('utf-8'),
3014 'uploader': video_uploader.decode('utf-8'),
3015 'upload_date': upload_date,
3016 'title': video_title,
3017 'stitle': simple_title,
3018 'ext': video_extension.decode('utf-8'),
3019 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3020 'thumbnail': video_thumbnail.decode('utf-8'),
3021 'description': video_description.decode('utf-8'),
3024 except UnavailableVideoError, err:
3025 self._downloader.trouble(u'\nERROR: unable to download video')
3027 class BlipTVIE(InfoExtractor):
3028 """Information extractor for blip.tv"""
3030 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3031 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3032 IE_NAME = u'blip.tv'
3034 def report_extraction(self, file_id):
3035 """Report information extraction."""
3036 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3038 def report_direct_download(self, title):
3039 """Report information extraction."""
3040 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
	def _real_extract(self, url):
		"""Extract video info from a blip.tv URL.

		Two cases: the URL serves a media file directly (Content-Type
		video/*), or it is a page whose metadata is fetched through the
		blip.tv JSON API (skin=json).
		"""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# Ask the API for an unwrapped JSON description of the video.
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
					'stitle': _simplify_title(title),
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

		if info is None: # Regular URL
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']

				# API reports e.g. '05-31-11 07:00PM'; normalise to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

					'id': data['item_id'],
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

		self._downloader.increment_downloads()
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class MyVideoIE(InfoExtractor):
	"""Information Extractor for myvideo.de."""

	# group(1): numeric video id, group(2): title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
	# Name used in log/screen output.
	IE_NAME = u'myvideo'
	def __init__(self, downloader=None):
		# NOTE(review): identical to the inherited constructor; kept only
		# for symmetry with the other extractors and could be removed.
		InfoExtractor.__init__(self, downloader)
3130 def report_download_webpage(self, video_id):
3131 """Report webpage download."""
3132 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3134 def report_extraction(self, video_id):
3135 """Report information extraction."""
3136 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3138 def _real_extract(self,url):
3139 mobj = re.match(self._VALID_URL, url)
3141 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3144 video_id = mobj.group(1)
3147 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3149 self.report_download_webpage(video_id)
3150 webpage = urllib2.urlopen(request).read()
3151 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3152 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3155 self.report_extraction(video_id)
3156 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3159 self._downloader.trouble(u'ERROR: unable to extract media URL')
3161 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3163 mobj = re.search('<title>([^<]+)</title>', webpage)
3165 self._downloader.trouble(u'ERROR: unable to extract title')
3168 video_title = mobj.group(1)
3169 video_title = sanitize_title(video_title)
3171 simple_title = _simplify_title(video_title)
3174 self._downloader.process_info({
3178 'upload_date': u'NA',
3179 'title': video_title,
3180 'stitle': simple_title,
3185 except UnavailableVideoError:
3186 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Either a shortcut name (":tds", ":colbert", ...) or a full-episodes
	# URL; named groups: shortname, showname, episode.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'
3194 def report_extraction(self, episode_id):
3195 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3197 def report_config_download(self, episode_id):
3198 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3200 def report_index_download(self, episode_id):
3201 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3203 def report_player_url(self, episode_id):
3204 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
	def _real_extract(self, url):
		"""Resolve a Daily Show / Colbert Report URL (or ":tds"-style
		shortcut) and download every media item listed in the episode's
		MRSS index feed."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode => follow the front page's redirect to the
		# newest full episode.
		dlNewest = not mobj.group('episode')
			epTitle = mobj.group('showname')
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
			epTitle = mobj.group('episode')

		# The embedded player URL carries an mtvnservices URI naming the episode.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
					urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			# Collect (bitrate, src) pairs for every available rendition.
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
				'upload_date': officialDate,
				'stitle': _simplify_title(effTitle),
				'description': officialTitle,
				'player_url': playerUrl

				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	# Named groups: showname (series slug) and episode (video slug).
	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
	IE_NAME = u'escapist'
3330 def report_extraction(self, showName):
3331 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3333 def report_config_download(self, showName):
3334 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
	def _real_extract(self, url):
		"""Extract an Escapist video: scrape the page's meta tags for the
		description, thumbnail and player URL, then read the player's
		JavaScript-style config to find the media URL."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
			webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		# The player URL carries its own config URL as a query parameter.
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
			configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

		# Technically, it's JavaScript, not JSON
		configJSON = configJSON.replace("'", '"')

			config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

		playlist = config['playlist']
		videoUrl = playlist[1]['url']

		self._downloader.increment_downloads()
			'uploader': showName,
			'upload_date': None,
			'stitle': _simplify_title(showName),
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	# Named groups: videoid (numeric) and shorttitle (slug).
	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'
3408 def report_webpage(self, video_id):
3409 """Report information extraction."""
3410 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3412 def report_extraction(self, video_id):
3413 """Report information extraction."""
3414 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
	def _real_extract(self, url):
		"""Extract a CollegeHumor video: find the internal video id on the
		page, then read the moogaloop metadata XML for URL/title/etc."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group('videoid')

		self.report_webpage(video_id)
		request = urllib2.Request(url)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
		internal_video_id = m.group('internalvideoid')

			'internal_id': internal_video_id,

		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
			metaXml = urllib2.urlopen(xmlUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

		mdoc = xml.etree.ElementTree.fromstring(metaXml)
			videoNode = mdoc.findall('./video')[0]
			info['description'] = videoNode.findall('./description')[0].text
			info['title'] = videoNode.findall('./caption')[0].text
			info['stitle'] = _simplify_title(info['title'])
			info['url'] = videoNode.findall('./file')[0].text
			info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
			# Extension is taken from the media URL's last dot-suffix.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

		self._downloader.increment_downloads()
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
	"""Information extractor for xvideos.com"""

	# group(1): numeric video id.
	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
	IE_NAME = u'xvideos'
3480 def report_webpage(self, video_id):
3481 """Report information extraction."""
3482 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3484 def report_extraction(self, video_id):
3485 """Report information extraction."""
3486 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
	def _real_extract(self, url):
		"""Extract the flv URL, title and thumbnail from an xvideos page."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		self.report_extraction(video_id)

		# Extract video URL (percent-encoded in the flv_url parameter).
		mobj = re.search(r'flv_url=(.+?)&', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video url')
		video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

		# Extract title from the <title> tag, dropping the site suffix.
		mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')

		self._downloader.increment_downloads()
			'upload_date': None,
			'title': video_title,
			'stitle': _simplify_title(video_title),
			'thumbnail': video_thumbnail,
			'description': None,

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com.

	To access the media, the uid of the song and a stream token
	must be extracted from the page source and the script must make
	a request to media.soundcloud.com/crossdomain.xml. Then
	the media can be grabbed by requesting from an url composed
	of the stream token and uid.
	"""

	# group(1): uploader slug, group(2): track slug.
	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'
	def __init__(self, downloader=None):
		# NOTE(review): identical to the inherited constructor; kept only
		# for symmetry with the other extractors and could be removed.
		InfoExtractor.__init__(self, downloader)
3570 def report_webpage(self, video_id):
3571 """Report information extraction."""
3572 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3574 def report_extraction(self, video_id):
3575 """Report information extraction."""
3576 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
	def _real_extract(self, url):
		"""Extract a SoundCloud track: pull the uid and stream token out of
		the page source and build the media.soundcloud.com stream URL."""
		# NOTE(review): several flow-control lines (guards, try:, return)
		# are elided in this excerpt; indentation mirrors the presumed
		# original structure.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title = mobj.group(2).decode('utf-8')
		simple_title = uploader + '-' + slug_title

		self.report_webpage('%s/%s' % (uploader, slug_title))

		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		self.report_extraction('%s/%s' % (uploader, slug_title))

		# extract uid and stream token that soundcloud hands out for access
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
			video_id = mobj.group(1)
			stream_token = mobj.group(2)

		# extract unsimplified title
		mobj = re.search('"title":"(.*?)",', webpage)
			title = mobj.group(1)

		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)

		# description defaults when the page carries none
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
			description = mobj.group(1)

		# upload date: parse the human-readable date into YYYYMMDD
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
			except Exception, e:

		# for soundcloud, a request to a cross domain is required for cookies
		request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'uploader': uploader.decode('utf-8'),
				'upload_date': upload_date,
				'title': simple_title.decode('utf-8'),
				'stitle': simple_title.decode('utf-8'),
				'description': description.decode('utf-8')
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
	"""Information extractor for infoq.com"""

	# Any two-segment infoq.com path (e.g. /presentations/<slug>).
	_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3659 def report_webpage(self, video_id):
3660 """Report information extraction."""
3661 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3663 def report_extraction(self, video_id):
3664 """Report information extraction."""
3665 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
	def _real_extract(self, url):
		"""Extract an InfoQ presentation; the rtmpe media path is stored
		base64-encoded in the page's jsclassref attribute."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		self.report_webpage(url)

		request = urllib2.Request(url)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		self.report_extraction(url)

		# Extract video URL (base64-decoded, appended to the rtmpe host).
		mobj = re.search(r"jsclassref='([^']*)'", webpage)
			self._downloader.trouble(u'ERROR: unable to extract video url')
		video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

		# Extract title
		mobj = re.search(r'contentTitle = "(.*?)";', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')

		# Extract description
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')

		# id and extension come from the media file's basename.
		video_filename = video_url.split('/')[-1]
		video_id, extension = video_filename.split('.')

		self._downloader.increment_downloads()
			'upload_date': None,
			'title': video_title,
			'stitle': _simplify_title(video_title),
			'format': extension, # Extension is always(?) mp4, but seems to be flv
			'description': video_description,

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""

	# group(1): uploader slug, group(2): cloudcast slug.
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'
	def __init__(self, downloader=None):
		# NOTE(review): identical to the inherited constructor; kept only
		# for symmetry with the other extractors and could be removed.
		InfoExtractor.__init__(self, downloader)
3739 def report_download_json(self, file_id):
3740 """Report JSON download."""
3741 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3743 def report_extraction(self, file_id):
3744 """Report information extraction."""
3745 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json.

		jsonData maps a format name either to a dict of bitrate -> url
		list, or directly to a url list when no bitrate info exists
		(hence the TypeError fallback below).
		"""
		# NOTE(review): the try: line and the final return are elided in
		# this excerpt.
			bitrate_list = jsonData[fmt]
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]
	def check_urls(self, url_list):
		"""Returns 1st active url from list"""
		# Probes each candidate with an HTTP request; network errors mean
		# "try the next one".
		# NOTE(review): the try:/return lines inside the loop are elided
		# in this excerpt.
		for url in url_list:
				urllib2.urlopen(url)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
	def _print_formats(self, formats):
		# Print one "fmt<TAB>bitrate<TAB>[ext]" line per available format;
		# bitrate shows '??' when the format has no per-bitrate entries.
		# NOTE(review): a try: line is elided in this excerpt.
		print 'Available formats:'
		for fmt in formats.keys():
			for b in formats[fmt]:
					ext = formats[fmt][b][0]
					print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
	def _real_extract(self, url):
		"""Extract a Mixcloud cloudcast through the public JSON API and
		pick a working URL for the requested (or best) format."""
		# NOTE(review): several flow-control lines (guards, try:, return)
		# are elided in this excerpt; indentation mirrors the presumed
		# original structure.
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')

		# construct API request
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = urllib2.Request(file_url)
			self.report_download_json(file_url)
			jsonData = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

		# parse JSON response into player URL + format table
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)

		if req_format is None or req_format == 'best':
			# Probe each format until one yields a live URL.
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				file_url = self.check_urls(url_list)
				if file_url is not None:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		self._downloader.increment_downloads()

		# Process file information
		self._downloader.process_info({
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader': uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': json_data['name'],
			'stitle': _simplify_title(json_data['name']),
			'ext': file_url.split('.')[-1].decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a CoursePage, or a VideoPage; named groups
	# course and video select which of the three extraction modes runs.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'
3858 def report_download_webpage(self, objid):
3859 """Report information extraction."""
3860 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3862 def report_extraction(self, video_id):
3863 """Report information extraction."""
3864 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
	def _real_extract(self, url):
		"""Handle three page kinds: a single video (download it), a course
		page, or the site root; the latter two collect 'reference'
		entries that are fed back into self.extract()."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# else:, dict openers) are elided in this excerpt; indentation
		# mirrors the presumed original structure.
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
				'id': _simplify_title(course + '_' + video),

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
				'id': _simplify_title(course),

			self.report_download_webpage(info['id'])
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

			m = re.search('<h1>([^<]+)</h1>', coursepage)
				info['title'] = unescapeHTML(m.group(1))
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
				info['description'] = unescapeHTML(m.group(1))

			# Every VideoPage link becomes a reference entry to re-extract.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
			# [elided "else:" branch -- the site root page]
			unescapeHTML = HTMLParser.HTMLParser().unescape
				'id': 'Stanford OpenClassroom',

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Every CoursePage link becomes a reference entry to re-extract.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
class MTVIE(InfoExtractor):
	"""Information extractor for MTV.com"""

	# Named groups: proto (optional scheme) and videoid (numeric).
	_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3977 def report_webpage(self, video_id):
3978 """Report information extraction."""
3979 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3981 def report_extraction(self, video_id):
3982 """Report information extraction."""
3983 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
	def _real_extract(self, url):
		"""Extract an MTV video: read the page's mtv_* meta tags, fetch the
		mediaGen metadata XML and pick the highest-quality rendition."""
		# NOTE(review): several flow-control lines (guards, try:, return,
		# dict openers) are elided in this excerpt; indentation mirrors
		# the presumed original structure.
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		if not mobj.group('proto'):
			url = 'http://' + url
		video_id = mobj.group('videoid')
		self.report_webpage(video_id)

		request = urllib2.Request(url)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Song name / performer come from mtv_vt / mtv_an meta tags.
		mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract song name')
		song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract performer')
		performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		video_title = performer + ' - ' + song_name

		mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
			self._downloader.trouble(u'ERROR: unable to mtvn_uri')
		mtvn_uri = mobj.group(1)

		mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
			self._downloader.trouble(u'ERROR: unable to extract content id')
		content_id = mobj.group(1)

		videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
		self.report_extraction(video_id)
		request = urllib2.Request(videogen_url)
			metadataXml = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

		mdoc = xml.etree.ElementTree.fromstring(metadataXml)
		renditions = mdoc.findall('.//rendition')

		# For now, always pick the highest quality.
		rendition = renditions[-1]

			_,_,ext = rendition.attrib['type'].partition('/')
			format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
			video_url = rendition.find('./src').text
			self._downloader.trouble('Invalid rendition field.')

		self._downloader.increment_downloads()
			'uploader': performer,
			'title': video_title,
			'stitle': _simplify_title(video_title),

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		# Downloader this PP is attached to; may also be set later via
		# set_downloader().
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe audio extraction or conversion fails.

    Fix: the original derived from BaseException, which is reserved for
    exceptions such as SystemExit/KeyboardInterrupt that should escape
    generic 'except Exception' handlers; a failed conversion is an ordinary
    error, so derive from Exception instead.
    """

    def __init__(self, message):
        # Also initialize the base class so str(err) and err.args carry the
        # message (the original left them empty).
        Exception.__init__(self, message)
        self.message = message  # kept: callers read err.message directly
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts the audio track of a downloaded video
    into a standalone audio file using the external ffmpeg/ffprobe tools.

    NOTE(review): decimated excerpt — several control-flow lines (try:/else:/
    return) from the full file are not visible here; indentation below is
    reconstructed best-effort.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        # preferredcodec: 'best', 'aac', 'mp3', 'vorbis', 'm4a' or 'wav';
        #     None is treated as 'best' (keep the source codec when possible).
        # preferredquality: ffmpeg audio bitrate spec (e.g. '128K'), or None.
        # keepvideo: when False, the source video file is removed after a
        #     successful extraction (see run()).
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    def get_audio_codec(path):
        # Probe *path* with ffprobe and return the name of its audio codec
        # (None on failure).  No self parameter: presumably declared
        # @staticmethod in the full file — TODO confirm.
        # '--' ends option parsing so unusual filenames are not read as flags.
        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        if handle.wait() != 0:
    except (IOError, OSError):  # NOTE(review): matching try: not visible in this excerpt
        # Parse ffprobe's key=value stream dump: remember the last codec_name
        # seen, report it when a codec_type=audio line follows.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:

    def run_ffmpeg(path, out_path, codec, more_opts):
        # Transcode *path* to *out_path* with audio codec *codec* and extra
        # ffmpeg options *more_opts*; raises AudioConversionError on failure.
        # (No self parameter: presumably @staticmethod — TODO confirm.)
        acodec_opts = ['-acodec', codec]
        # -vn drops the video stream; '--' guards against odd output names.
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout,stderr = p.communicate()
    except (IOError, OSError):  # NOTE(review): matching try: not visible in this excerpt
        e = sys.exc_info()[1]
        if isinstance(e, OSError) and e.errno == 2:  # errno 2 == ENOENT: binary missing
            raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
        if p.returncode != 0:
            # Surface ffmpeg's last stderr line as the failure message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        # Called by the downloader with the info dict of a finished download;
        # converts information['filepath'] to an audio file and updates the
        # dict's 'filepath' for the next post-processor.
        path = information['filepath']
        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
        # Pick target codec/extension/options.  This branch: the source codec
        # already satisfies the request, so avoid a lossy re-encode.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    # NOTE(review): surrounding else-branch lines elided in excerpt
                    acodec = 'libmp3lame'
                    if self._preferredquality is not None:
                        more_opts += ['-ab', self._preferredquality]
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
            if self._preferredcodec == 'wav':
                more_opts += ['-f', 'wav']
        # Build the destination name by swapping the extension.
        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
            self.run_ffmpeg(path, new_path, acodec, more_opts)  # inside a try: (elided)
            etype,e,tb = sys.exc_info()  # except-handler body; handler line elided
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))  # try: elided
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')  # except-branch body
        if not self._keepvideo:
                os.remove(_encodeFilename(path))  # try: elided
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
        # Hand the updated info dict to the next PP in the chain.
        information['filepath'] = new_path
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    # NOTE(review): decimated excerpt — try:/finally:/close() lines from the
    # full file are not visible; indentation reconstructed best-effort.
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)
    downloader.to_screen(u'Updating to latest version...')
        # Fetch the latest script and bail out early if already current.
        urlh = urllib.urlopen(UPDATE_URL)
        newcontent = urlh.read()
        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
            downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
    except (IOError, OSError), err:  # matching try: not visible in this excerpt
        sys.exit('ERROR: unable to download latest version')
        # Overwrite this very script file with the downloaded content.
        outf = open(filename, 'wb')
        outf.write(newcontent)
    except (IOError, OSError), err:  # matching try: not visible in this excerpt
        sys.exit('ERROR: unable to overwrite current version')
    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename_bytes):
    """Read extra command-line arguments from a config file.

    Returns a list of option tokens; an unreadable/absent file yields [].
    NOTE(review): decimated excerpt — the try/except and accumulator/loop
    scaffolding around these lines is not visible.
    """
    optionf = open(filename_bytes)
        return [] # silently skip if file is not present
        # Each line is split shell-style; '#' comments are honored.
        res += shlex.split(l, comments=True)
4289 def _format_option_string(option):
4290 ''' ('-o', '--option') -> -o, --format METAVAR'''
4294 if option._short_opts: opts.append(option._short_opts[0])
4295 if option._long_opts: opts.append(option._long_opts[0])
4296 if len(opts) > 1: opts.insert(1, ', ')
4298 if option.takes_value(): opts.append(' %s' % option.metavar)
4300 return "".join(opts)
def _find_term_columns():
    """Best-effort terminal width: $COLUMNS first, then `stty size`.

    NOTE(review): decimated excerpt — the return-of-$COLUMNS, try/except and
    fallback-return lines from the full file are not visible here.
    """
    columns = os.environ.get('COLUMNS', None)
    # Fallback: ask the terminal via stty; its output is "rows cols".
    sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out,err = sp.communicate()
    return int(out.split()[1])
# NOTE(review): body of parseOpts() — the enclosing "def parseOpts():" header
# and a few lines (e.g. the default max_width assignment and the "kw = {"
# literal's braces) are not visible in this excerpt.
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns  # overrides the (elided) default width

    # Help formatter: wide layout; option strings rendered by the
    # module-level _format_option_string helper.
    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    # OptionParser keyword arguments (surrounding "kw = {" / "}" elided).
        'version' : __version__,
        'usage' : '%prog [options] url [url...]',
        'conflict_handler' : 'resolve',
    parser = optparse.OptionParser(**kw)

    # One OptionGroup per section of the --help output.
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)
    # Note: -v is also used by --version above; 'conflict_handler: resolve'
    # lets this later definition win.
    verbosity.add_option('-v', '--verbose',
            action='store_true', dest='verbose', help='print various debugging information', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)
    filesystem.add_option('--write-srt',
            action='store_true', dest='writesubtitles',
            help='write video subtitles to a .srt file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # Config-file options: /etc/youtube-dl.conf plus a per-user file; honors
    # $XDG_CONFIG_HOME when set (the if/else lines of that branch are elided).
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Shared helper IEs: the search/playlist/user extractors delegate the
    # actual per-video extraction to these instances.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # Entries of the returned list.  NOTE(review): the surrounding
    # "return [" / "]" lines and many further IE entries are not visible in
    # this excerpt.
        YoutubePlaylistIE(youtube_ie),
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        MetacafeIE(youtube_ie),
        GoogleSearchIE(google_ie),
        YahooSearchIE(yahoo_ie),
        StanfordOpenClassroomIE(),
# NOTE(review): body of _real_main() — the enclosing def header and a number
# of try:/else: lines are not visible in this excerpt; indentation below is
# reconstructed best-effort.
    parser, opts, args = parseOpts()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
        # File-backed jar; pre-loads existing cookies when readable.
        jar = cookielib.MozillaCookieJar(opts.cookiefile)
        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
    except (IOError, OSError), err:  # matching try: not visible in this excerpt
        sys.exit(u'ERROR: unable to open cookie file')

    # --dump-user-agent: print the UA string we send.
    if opts.dump_user_agent:
        print std_headers['User-Agent']

    # Batch file verification
    if opts.batchfile is not None:
        if opts.batchfile == '-':
            # stdin branch elided; otherwise read URLs from the named file.
            batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            # Drop blank lines and '#'/'/'/';'-prefixed comment lines.
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
            sys.exit(u'ERROR: batch file could not be read')  # except-branch body; try/except lines elided
    all_urls = batchurls + args

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    proxy_handler = urllib2.ProxyHandler()
    opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Verbose-mode diagnostic (guarding "if opts.verbose:" elided).
    print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

    extractors = gen_extractors()

    # --list-extractors: show each IE and which of the given URLs it handles.
    if opts.list_extractors:
        for ie in extractors:
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
            opts.retries = long(opts.retries)  # try: line elided
        except (TypeError, ValueError), err:
            parser.error(u'invalid retry count specified')
        opts.playliststart = int(opts.playliststart)  # try: line elided
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist start number specified')
        opts.playlistend = int(opts.playlistend)  # try: line elided
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
            parser.error(u'invalid audio format specified')

    # File downloader.  NOTE(review): the closing "})" of this call is not
    # visible in this excerpt.
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # Quiet whenever any of the print-and-exit modes is active.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        # Output template: explicit -o wins, then format/title/number driven
        # defaults, finally plain "%(id)s.%(ext)s".
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'writesubtitles': opts.writesubtitles,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        'max_downloads': opts.max_downloads,
        'prefer_free_formats': opts.prefer_free_formats,
        'verbose': opts.verbose,
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors: audio extraction, when requested.
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # -U/--update: replace this script with the latest release.
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # No URLs given: an error unless we were only asked to self-update.
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')

        retcode = fd.download(all_urls)  # try: line elided
    except MaxDownloadsReached:
        fd.to_screen(u'--max-download limit reached, aborting.')

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        except (IOError, OSError), err:  # jar.save() inside elided try:
            sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): these handlers belong to main() — the "def main():" header and
# the "try: _real_main()" lines are not visible in this excerpt.
    except DownloadError:
        # Non-zero exit for download failures (exit call elided).
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        # Ctrl-C: leave with a message rather than a traceback.
        sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
    # Script entry point — the call into main() is not visible in this excerpt.

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: