X-Git-Url: http://git.jankratochvil.net/?p=youtube-dl.git;a=blobdiff_plain;f=youtube-dl;h=c1274ebca48c98a743a71c2da8af85a10410029c;hp=579a33d0c9fa3e23c9c7c7ab239fa58068220af3;hb=HEAD;hpb=99e207bab0f6e3c2e70fc5e4ba2154ca536de0eb diff --git a/youtube-dl b/youtube-dl index 579a33d..c1274eb 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -__author__ = ( +__authors__ = ( 'Ricardo Garcia Gonzalez', 'Danny Colligan', 'Benjamin Johnson', @@ -12,15 +12,20 @@ __author__ = ( 'Rogério Brito', 'Philipp Hagemeister', 'Sören Schulze', + 'Kevin Ngo', + 'Ori Avtalion', + 'shizeeg', ) __license__ = 'Public Domain' -__version__ = '2011.10.19' +__version__ = '2012.02.27' UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' + import cookielib import datetime +import getpass import gzip import htmlentitydefs import HTMLParser @@ -28,9 +33,11 @@ import httplib import locale import math import netrc +import optparse import os import os.path import re +import shlex import socket import string import subprocess @@ -77,8 +84,6 @@ std_headers = { 'Accept-Language': 'en-us,en;q=0.5', } -simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') - try: import json except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson): @@ -258,14 +263,14 @@ def sanitize_open(filename, open_mode): import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) return (sys.stdout, filename) - stream = open(filename, open_mode) + stream = open(_encodeFilename(filename), open_mode) return (stream, filename) except (IOError, OSError), err: # In case of error, try to remove win32 forbidden chars filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) # An exception here should be caught in the caller - stream = open(filename, open_mode) + stream = open(_encodeFilename(filename), open_mode) return (stream, filename) @@ -277,6 +282,41 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp +def _simplify_title(title): + expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) + return expr.sub(u'_', title).strip(u'_') + +def _orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + +def _unescapeHTML(s): + """ + @param s a string (of type unicode) + """ + assert type(s) == type(u'') + + htmlParser = HTMLParser.HTMLParser() + return htmlParser.unescape(s) + +def _encodeFilename(s): + """ + @param s The name of the file (of type unicode) + """ + + assert type(s) == type(u'') + + if sys.platform == 'win32' and sys.getwindowsversion().major >= 5: + # Pass u'' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + return s + else: + return s.encode(sys.getfilesystemencoding(), 'ignore') class DownloadError(Exception): """Download Error exception. @@ -305,6 +345,10 @@ class PostProcessingError(Exception): """ pass +class MaxDownloadsReached(Exception): + """ --max-downloads limit has been reached. """ + pass + class UnavailableVideoError(Exception): """Unavailable Format exception. @@ -446,6 +490,8 @@ class FileDownloader(object): updatetime: Use the Last-modified header to set output file timestamps. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writesubtitles: Write the video subtitles to a .srt file + subtitleslang: Language of the subtitles to download """ params = None @@ -538,16 +584,17 @@ class FileDownloader(object): self._pps.append(pp) pp.set_downloader(self) - def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False): + def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - try: - if not self.params.get('quiet', False): - terminator = [u'\n', u''][skip_eol] - print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()), + assert type(message) == type(u'') + if not self.params.get('quiet', False): + terminator = [u'\n', u''][skip_eol] + output = message + terminator + + if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr + output = output.encode(preferredencoding(), 'ignore') + self._screen_file.write(output) self._screen_file.flush() - except (UnicodeEncodeError), err: - if not ignore_encoding_errors: - raise def to_stderr(self, message): """Print message to stderr.""" @@ -597,7 +644,7 @@ class FileDownloader(object): def temp_name(self, filename): """Returns a temporary filename for the given filename.""" if self.params.get('nopart', False) or filename == u'-' or \ - (os.path.exists(filename) and not os.path.isfile(filename)): + (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))): return filename return filename + u'.part' @@ -610,7 +657,7 @@ class FileDownloader(object): try: if old_filename == new_filename: return - os.rename(old_filename, new_filename) + os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename)) except (IOError, OSError), err: self.trouble(u'ERROR: unable to rename file') @@ -618,7 +665,7 @@ class FileDownloader(object): """Try to set the last-modified time of the given file.""" if last_modified_hdr is None: return - if not os.path.isfile(filename): + if not os.path.isfile(_encodeFilename(filename)): return timestr = last_modified_hdr if timestr is None: @@ -634,15 +681,19 @@ class FileDownloader(object): def report_writedescription(self, descfn): """ Report that the description file is being written """ - self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True) + self.to_screen(u'[info] Writing video description to: ' + descfn) + + def report_writesubtitles(self, srtfn): + """ Report that the subtitles file is being written """ + self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ - self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True) + self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) def report_destination(self, filename): """Report destination filename.""" - self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True) + self.to_screen(u'[download] Destination: ' + filename) def report_progress(self, percent_str, data_len_str, speed_str, eta_str): """Report download progress.""" @@ -695,8 +746,31 @@ class FileDownloader(object): self.trouble(u'ERROR: invalid system charset or erroneous output template') return None + def _match_entry(self, info_dict): + """ Returns None iff the file should be downloaded """ + + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + return None + def process_info(self, info_dict): """Process a single dictionary returned by an InfoExtractor.""" + + reason = self._match_entry(info_dict) + if reason is not None: + self.to_screen(u'[download] ' + reason) + return + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads > int(max_downloads): + raise MaxDownloadsReached() + filename = self.prepare_filename(info_dict) # Forced printings @@ -720,23 +794,9 @@ class FileDownloader(object): if filename is None: return - matchtitle=self.params.get('matchtitle',False) - rejecttitle=self.params.get('rejecttitle',False) - title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') - if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): - self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle)) - return - if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): - self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle)) - return - - if self.params.get('nooverwrites', False) and os.path.exists(filename): - self.to_stderr(u'WARNING: file exists and will be skipped') - return - try: - dn = os.path.dirname(filename) - if dn != '' and not os.path.exists(dn): + dn = os.path.dirname(_encodeFilename(filename)) + if dn != '' and not os.path.exists(dn): # dn is already encoded os.makedirs(dn) except (OSError, IOError), err: self.trouble(u'ERROR: unable to create directory ' + unicode(err)) @@ -744,9 +804,9 @@ class FileDownloader(object): if self.params.get('writedescription', False): try: - descfn = filename + '.description' + descfn = filename + u'.description' self.report_writedescription(descfn) - descfile = open(descfn, 'wb') + descfile = open(_encodeFilename(descfn), 'wb') try: descfile.write(info_dict['description'].encode('utf-8')) finally: @@ -754,9 +814,24 @@ class FileDownloader(object): except (OSError, IOError): self.trouble(u'ERROR: Cannot write description file ' + descfn) return + + if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + try: + srtfn = filename.rsplit('.', 1)[0] + u'.srt' + self.report_writesubtitles(srtfn) + srtfile = open(_encodeFilename(srtfn), 'wb') + try: + srtfile.write(info_dict['subtitles'].encode('utf-8')) + finally: + srtfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) + return if self.params.get('writeinfojson', False): - infofn = filename + '.info.json' + infofn = filename + u'.info.json' self.report_writeinfojson(infofn) try: json.dump @@ -764,7 +839,7 @@ class FileDownloader(object): self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') return try: - infof = open(infofn, 'wb') + infof = open(_encodeFilename(infofn), 'wb') try: json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) json.dump(json_info_dict, infof) @@ -775,16 +850,19 @@ class FileDownloader(object): return if not self.params.get('skip_download', False): - try: - success = self._do_download(filename, info_dict) - except (OSError, IOError), err: - raise UnavailableVideoError - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self.trouble(u'ERROR: unable to download video data: %s' % str(err)) - return - except (ContentTooShortError, ), err: - self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) - return + if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)): + success = True + else: + try: + success = self._do_download(filename, info_dict) + except (OSError, IOError), err: + raise UnavailableVideoError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble(u'ERROR: unable to download video data: %s' % str(err)) + return + except (ContentTooShortError, ), err: + self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + return if success: try: @@ -843,13 +921,21 @@ class FileDownloader(object): # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] - retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]) + args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)] + if self.params.get('verbose', False): + try: + import pipes + shell_quote = lambda args: ' '.join(map(pipes.quote, args)) + except ImportError: + shell_quote = repr + self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) + retval = subprocess.call(args) while retval == 2 or retval == 1: - prevsize = os.path.getsize(tmpfilename) + prevsize = os.path.getsize(_encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) time.sleep(5.0) # This seems to be needed retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) - cursize = os.path.getsize(tmpfilename) + cursize = os.path.getsize(_encodeFilename(tmpfilename)) if prevsize == cursize and retval == 1: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those @@ -858,7 +944,7 @@ class FileDownloader(object): retval = 0 break if retval == 0: - self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) + self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename))) self.try_rename(tmpfilename, filename) return True else: @@ -870,7 +956,7 @@ class FileDownloader(object): player_url = info_dict.get('player_url', None) # Check file already present - if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): + if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False): self.report_file_already_downloaded(filename) return True @@ -887,8 +973,8 @@ class FileDownloader(object): request = urllib2.Request(url, None, headers) # Establish possible resume length - if os.path.isfile(tmpfilename): - resume_len = os.path.getsize(tmpfilename) + if os.path.isfile(_encodeFilename(tmpfilename)): + resume_len = os.path.getsize(_encodeFilename(tmpfilename)) else: resume_len = 0 @@ -1091,7 +1177,8 @@ class YoutubeIE(InfoExtractor): _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NETRC_MACHINE = 'youtube' # Listed in order of quality - _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -1102,6 +1189,7 @@ class YoutubeIE(InfoExtractor): '43': 'webm', '44': 'webm', '45': 'webm', + '46': 'webm', } _video_dimensions = { '5': '240x400', @@ -1117,6 +1205,7 @@ class YoutubeIE(InfoExtractor): '43': '360x640', '44': '480x854', '45': '720x1280', + '46': '1080x1920', } IE_NAME = u'youtube' @@ -1140,6 +1229,10 @@ class YoutubeIE(InfoExtractor): """Report attempt to download video info webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) + def report_video_subtitles_download(self, video_id): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) @@ -1152,6 +1245,23 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') + def _closed_captions_xml_to_srt(self, xml_string): + srt = '' + texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) + # TODO parse xml instead of regex + for n, (start, dur_tag, dur, caption) in enumerate(texts): + if not dur: dur = '4' + start = float(start) + end = start + float(dur) + start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) + end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) + caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) + caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional + srt += str(n) + '\n' + srt += start + ' --> ' + end + '\n' + srt += caption + '\n\n' + return srt + def _print_formats(self, formats): print 'Available formats:' for x in formats: @@ -1289,8 +1399,7 @@ class YoutubeIE(InfoExtractor): video_title = sanitize_title(video_title) # simplified title - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - simple_title = simple_title.strip(ur'_') + simple_title = _simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -1316,15 +1425,45 @@ class YoutubeIE(InfoExtractor): lxml.etree except NameError: video_description = u'No description available.' - if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False): - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') else: html_parser = lxml.etree.HTMLParser(encoding='utf-8') vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) # TODO use another parser + + # closed captions + video_subtitles = None + if self._downloader.params.get('writesubtitles', False): + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) + if srt_lang_list: + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' + else: + srt_lang = srt_lang_list[0] + if not srt_lang in srt_lang_list: + self._downloader.trouble(u'WARNING: no closed captions found in the specified language') + else: + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + else: + self._downloader.trouble(u'WARNING: video has no closed captions') # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -1342,10 +1481,11 @@ class YoutubeIE(InfoExtractor): url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) format_limit = self._downloader.params.get('format_limit', None) - if format_limit is not None and format_limit in self._available_formats: - format_list = self._available_formats[self._available_formats.index(format_limit):] + available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats + if format_limit is not None and format_limit in available_formats: + format_list = available_formats[available_formats.index(format_limit):] else: - format_list = self._available_formats + format_list = available_formats existing_formats = [x for x in format_list if x in url_map] if len(existing_formats) == 0: self._downloader.trouble(u'ERROR: no known formats available for video') @@ -1396,6 +1536,7 @@ class YoutubeIE(InfoExtractor): 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, + 'subtitles': video_subtitles }) except UnavailableVideoError, err: self._downloader.trouble(u'\nERROR: unable to download video') @@ -1560,9 +1701,6 @@ class DailymotionIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) @@ -1574,7 +1712,6 @@ class DailymotionIE(InfoExtractor): self._downloader.increment_downloads() video_id = mobj.group(1) - simple_title = mobj.group(2).decode('utf-8') video_extension = 'flv' # Retrieve video webpage to extract further information @@ -1604,12 +1741,13 @@ class DailymotionIE(InfoExtractor): video_url = mediaURL - mobj = re.search(r'(?im)Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?', webpage) + mobj = re.search(r'', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract title') return - video_title = mobj.group(1).decode('utf-8') + video_title = _unescapeHTML(mobj.group('title').decode('utf-8')) video_title = sanitize_title(video_title) + simple_title = _simplify_title(video_title) mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: @@ -1651,9 +1789,6 @@ class GoogleIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -1697,7 +1832,7 @@ class GoogleIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) # Extract video description mobj = re.search(r'([^<]*)', webpage) @@ -1758,9 +1893,6 @@ class PhotobucketIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -1799,7 +1931,7 @@ class PhotobucketIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(vide_title) video_uploader = mobj.group(2).decode('utf-8') @@ -1840,9 +1972,6 @@ class YahooIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url, new_video=True): # Extract ID from URL mobj = re.match(self._VALID_URL, url) @@ -1896,7 +2025,7 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) mobj = re.search(r'

(.*)

', webpage) if mobj is None: @@ -1993,9 +2122,6 @@ class VimeoIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url, new_video=True): # Extract ID from URL mobj = re.match(self._VALID_URL, url) @@ -2008,7 +2134,7 @@ class VimeoIE(InfoExtractor): video_id = mobj.group(1) # Retrieve video webpage to extract further information - request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers) + request = urllib2.Request(url, None, std_headers) try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() @@ -2021,77 +2147,75 @@ class VimeoIE(InfoExtractor): # and latter we extract those that are Vimeo specific. self.report_extraction(video_id) - # Extract title - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + # Extract the config JSON + config = webpage.split(' = {config:')[1].split(',assets:')[0] + try: + config = json.loads(config) + except: + self._downloader.trouble(u'ERROR: unable to extract info section') return - video_title = mobj.group(1).decode('utf-8') - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + + # Extract title + video_title = config["video"]["title"] + simple_title = _simplify_title(video_title) # Extract uploader - mobj = re.search(r'http://vimeo.com/(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') - return - video_uploader = mobj.group(1).decode('utf-8') + video_uploader = config["video"]["owner"]["name"] # Extract video thumbnail - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') - return - video_thumbnail = mobj.group(1).decode('utf-8') - - # # Extract video description - # mobj = re.search(r'', webpage) - # if mobj is None: - # self._downloader.trouble(u'ERROR: unable to extract video description') - # return - # video_description = mobj.group(1).decode('utf-8') - # if not video_description: video_description = 'No description available.' - video_description = 'Foo.' - - # Vimeo specific: extract request signature - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract request signature') - return - sig = mobj.group(1).decode('utf-8') - - # Vimeo specific: extract video quality information - mobj = re.search(r'(\d+)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video quality information') - return - quality = mobj.group(1).decode('utf-8') + video_thumbnail = config["video"]["thumbnail"] - if int(quality) == 1: - quality = 'hd' + # Extract video description + try: + lxml.etree + except NameError: + video_description = u'No description available.' + mobj = re.search(r'', webpage, re.MULTILINE) + if mobj is not None: + video_description = mobj.group(1) else: - quality = 'sd' + html_parser = lxml.etree.HTMLParser() + vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) + video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() + # TODO use another parser - # Vimeo specific: Extract request signature expiration - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract request signature expiration') + # Extract upload date + video_upload_date = u'NA' + mobj = re.search(r'', webpage) + if mobj is not None: + video_upload_date = mobj.group(1) + + # Vimeo specific: extract request signature and timestamp + sig = config['request']['signature'] + timestamp = config['request']['timestamp'] + + # Vimeo specific: extract video codec and quality information + # TODO bind to format param + codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] + for codec in codecs: + if codec[0] in config["video"]["files"]: + video_codec = codec[0] + video_extension = codec[1] + if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' + else: quality = 'sd' + break + else: + self._downloader.trouble(u'ERROR: no known codec found') return - sig_exp = mobj.group(1).decode('utf-8') - video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality) + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, quality, video_codec.upper()) try: # Process video information self._downloader.process_info({ - 'id': video_id.decode('utf-8'), + 'id': video_id, 'url': video_url, 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, 'stitle': simple_title, - 'ext': u'mp4', - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description, + 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, 'player_url': None, @@ -2118,9 +2242,6 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url): # At this point we have a new video self._downloader.increment_downloads() @@ -2174,7 +2295,7 @@ class GenericIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) @@ -2203,9 +2324,7 @@ class GenericIE(InfoExtractor): class YoutubeSearchIE(InfoExtractor): """Information Extractor for YouTube search queries.""" _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' - _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR = r'href="/watch\?v=.+?"' - _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' + _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _youtube_ie = None _max_youtube_results = 1000 IE_NAME = u'youtube:search' @@ -2256,45 +2375,39 @@ class YoutubeSearchIE(InfoExtractor): """Downloads a specified number of results for a query""" video_ids = [] - already_seen = set() - pagenum = 1 + pagenum = 0 + limit = n - while True: - self.report_download_page(query, pagenum) - result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) + while (50 * pagenum) < limit: + self.report_download_page(query, pagenum+1) + result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1) request = urllib2.Request(result_url) try: - page = urllib2.urlopen(request).read() + data = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err)) return + api_response = json.loads(data)['data'] - # Extract video identifiers - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1] - if video_id not in already_seen: - video_ids.append(video_id) - already_seen.add(video_id) - if len(video_ids) == n: - # Specified n videos reached - for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) - return + new_ids = list(video['id'] for video in api_response['items']) + video_ids += new_ids - if re.search(self._MORE_PAGES_INDICATOR, page) is None: - for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) - return + limit = min(n, api_response['totalItems']) + pagenum += 1 - pagenum = pagenum + 1 + if len(video_ids) > n: + video_ids = video_ids[:n] + for id in video_ids: + self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) + return class GoogleSearchIE(InfoExtractor): """Information Extractor for Google Video search queries.""" _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' - _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' - _MORE_PAGES_INDICATOR = r'Next' + _VIDEO_INDICATOR = r'\s*Next\s*' _youtube_ie = None IE_NAME = u'youtube:playlist' @@ -2514,7 +2625,8 @@ class YoutubePlaylistIE(InfoExtractor): while True: self.report_download_page(playlist_id, pagenum) - request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)) + url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) + request = urllib2.Request(url) try: page = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: @@ -2523,7 +2635,7 @@ class YoutubePlaylistIE(InfoExtractor): # Extract video identifiers ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR, page): + for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): if mobj.group(1) not in ids_in_page: ids_in_page.append(mobj.group(1)) video_ids.extend(ids_in_page) @@ -2534,7 +2646,10 @@ class YoutubePlaylistIE(InfoExtractor): playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) - video_ids = video_ids[playliststart:playlistend] + if playlistend == -1: + video_ids = video_ids[playliststart:] + else: + video_ids = video_ids[playliststart:playlistend] for id in video_ids: self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) @@ -2548,7 +2663,7 @@ class YoutubeUserIE(InfoExtractor): _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' - _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' + _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' _youtube_ie = None IE_NAME = u'youtube:user' @@ -2622,7 +2737,7 @@ class YoutubeUserIE(InfoExtractor): else: video_ids = video_ids[playliststart:playlistend] - self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" % + self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % (username, all_ids_count, len(video_ids))) for video_id in video_ids: @@ -2646,9 +2761,6 @@ class DepositFilesIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) - def _real_initialize(self): - return - def _real_extract(self, url): # At this point we have a new file self._downloader.increment_downloads() @@ -2709,7 +2821,7 @@ class DepositFilesIE(InfoExtractor): class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P\d+)(?:.*)' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P\d+)(?:.*)' _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' _NETRC_MACHINE = 'facebook' _available_formats = ['video', 'highqual', 'lowqual'] @@ -2745,7 +2857,6 @@ class FacebookIE(InfoExtractor): data = {'title': r'\("video_title", "(.*?)"\)', 'description': r'
(.*?)
', 'owner': r'\("video_owner_name", "(.*?)"\)', - 'upload_date': r'data-date="(.*?)"', 'thumbnail': r'\("thumb_url", "(?P.*?)"\)', } video_info = {} @@ -2847,9 +2958,7 @@ class FacebookIE(InfoExtractor): video_title = video_title.decode('utf-8') video_title = sanitize_title(video_title) - # simplified title - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - simple_title = simple_title.strip(ur'_') + simple_title = _simplify_title(video_title) # thumbnail image if 'thumbnail' not in video_info: @@ -2940,11 +3049,6 @@ class BlipTVIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title)) - def _simplify_title(self, title): - res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) - res = res.strip(ur'_') - return res - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -2964,13 +3068,14 @@ class BlipTVIE(InfoExtractor): if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download basename = url.split('/')[-1] title,ext = os.path.splitext(basename) + title = title.decode('UTF-8') ext = ext.replace('.', '') self.report_direct_download(title) info = { 'id': title, 'url': url, 'title': title, - 'stitle': self._simplify_title(title), + 'stitle': _simplify_title(title), 'ext': ext, 'urlhandle': urlh } @@ -3004,7 +3109,7 @@ class BlipTVIE(InfoExtractor): 'uploader': data['display_name'], 'upload_date': upload_date, 'title': data['title'], - 'stitle': self._simplify_title(data['title']), + 'stitle': _simplify_title(data['title']), 'ext': ext, 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], @@ -3040,9 +3145,6 @@ class MyVideoIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -3050,10 +3152,6 @@ class MyVideoIE(InfoExtractor): return video_id = mobj.group(1) - simple_title = mobj.group(2).decode('utf-8') - # should actually not be necessary - simple_title = sanitize_title(simple_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title) # Get video webpage request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id) @@ -3080,6 +3178,8 @@ class MyVideoIE(InfoExtractor): video_title = mobj.group(1) video_title = sanitize_title(video_title) + simple_title = _simplify_title(video_title) + try: self._downloader.process_info({ 'id': video_id, @@ -3113,11 +3213,6 @@ class ComedyCentralIE(InfoExtractor): def report_player_url(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id) - def _simplify_title(self, title): - res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) - res = res.strip(ur'_') - return res - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -3126,9 +3221,9 @@ class ComedyCentralIE(InfoExtractor): if mobj.group('shortname'): if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = 'http://www.thedailyshow.com/full-episodes/' + url = u'http://www.thedailyshow.com/full-episodes/' else: - url = 'http://www.colbertnation.com/full-episodes/' + url = u'http://www.colbertnation.com/full-episodes/' mobj = re.match(self._VALID_URL, url) assert mobj is not None @@ -3157,7 +3252,7 @@ class ComedyCentralIE(InfoExtractor): return epTitle = mobj.group('episode') - mMovieParams = re.findall('', html) + mMovieParams = re.findall('(?:

(.*?)

', webpage) + if mobj: + description = mobj.group(1) + + # upload date + upload_date = None + mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", webpage) + if mobj: + try: + upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') + except Exception, e: + print str(e) + + # for soundcloud, a request to a cross domain is required for cookies + request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) + + try: + self._downloader.process_info({ + 'id': video_id.decode('utf-8'), + 'url': mediaURL, + 'uploader': uploader.decode('utf-8'), + 'upload_date': upload_date, + 'title': simple_title.decode('utf-8'), + 'stitle': simple_title.decode('utf-8'), + 'ext': u'mp3', + 'format': u'NA', + 'player_url': None, + 'description': description.decode('utf-8') + }) + except UnavailableVideoError: + self._downloader.trouble(u'\nERROR: unable to download video') + + +class InfoQIE(InfoExtractor): + """Information extractor for infoq.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' + IE_NAME = u'infoq' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + self.report_webpage(url) + + request = urllib2.Request(url) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + self.report_extraction(url) + + + # Extract video URL + mobj = re.search(r"jsclassref='([^']*)'", webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video url') + return + video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64')) + + + # Extract title + mobj = re.search(r'contentTitle = "(.*?)";', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video title') + return + video_title = mobj.group(1).decode('utf-8') + + # Extract description + video_description = u'No description available.' + mobj = re.search(r'', webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') + + video_filename = video_url.split('/')[-1] + video_id, extension = video_filename.split('.') + + self._downloader.increment_downloads() + info = { + 'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': None, + 'title': video_title, + 'stitle': _simplify_title(video_title), + 'ext': extension, + 'format': extension, # Extension is always(?) mp4, but seems to be flv + 'thumbnail': None, + 'description': video_description, + 'player_url': None, + } + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download ' + video_url) + +class MixcloudIE(InfoExtractor): + """Information extractor for www.mixcloud.com""" + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' + IE_NAME = u'mixcloud' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_download_json(self, file_id): + """Report JSON download.""" + self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME) + + def report_extraction(self, file_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) + + def get_urls(self, jsonData, fmt, bitrate='best'): + """Get urls from 'audio_formats' section in json""" + file_url = None + try: + bitrate_list = jsonData[fmt] + if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: + bitrate = max(bitrate_list) # select highest + + url_list = jsonData[fmt][bitrate] + except TypeError: # we have no bitrate info. + url_list = jsonData[fmt] + + return url_list + + def check_urls(self, url_list): + """Returns 1st active url from list""" + for url in url_list: + try: + urllib2.urlopen(url) + return url + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + url = None + + return None + + def _print_formats(self, formats): + print 'Available formats:' + for fmt in formats.keys(): + for b in formats[fmt]: + try: + ext = formats[fmt][b][0] + print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]) + except TypeError: # we have no bitrate info + ext = formats[fmt][0] + print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]) + break + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + # extract uploader & filename from url + uploader = mobj.group(1).decode('utf-8') + file_id = uploader + "-" + mobj.group(2).decode('utf-8') + + # construct API request + file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' + # retrieve .json file with links to files + request = urllib2.Request(file_url) + try: + self.report_download_json(file_url) + jsonData = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err)) + return + + # parse JSON + json_data = json.loads(jsonData) + player_url = json_data['player_swf_url'] + formats = dict(json_data['audio_formats']) + + req_format = self._downloader.params.get('format', None) + bitrate = None + + if self._downloader.params.get('listformats', None): + self._print_formats(formats) + return + + if req_format is None or req_format == 'best': + for format_param in formats.keys(): + url_list = self.get_urls(formats, format_param) + # check urls + file_url = self.check_urls(url_list) + if file_url is not None: + break # got it! + else: + if req_format not in formats.keys(): + self._downloader.trouble(u'ERROR: format is not available') + return + + url_list = self.get_urls(formats, req_format) + file_url = self.check_urls(url_list) + format_param = req_format + + # We have audio + self._downloader.increment_downloads() + try: + # Process file information + self._downloader.process_info({ + 'id': file_id.decode('utf-8'), + 'url': file_url.decode('utf-8'), + 'uploader': uploader.decode('utf-8'), + 'upload_date': u'NA', + 'title': json_data['name'], + 'stitle': _simplify_title(json_data['name']), + 'ext': file_url.split('.')[-1].decode('utf-8'), + 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'thumbnail': json_data['thumbnail_url'], + 'description': json_data['description'], + 'player_url': player_url.decode('utf-8'), + }) + except UnavailableVideoError, err: + self._downloader.trouble(u'ERROR: unable to download file') + +class StanfordOpenClassroomIE(InfoExtractor): + """Information extractor for Stanford's Open ClassRoom""" + + _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P