X-Git-Url: https://git.jankratochvil.net/?a=blobdiff_plain;f=youtube-dl;h=5c81973cd4ea33c1c470fa44d4940dce2f7ef90c;hb=115d243428b788b98fd8f1654e58a2cf27833c61;hp=b22f1cac274fa165563f1f7e32b5a5ee474644f6;hpb=38ffbc022232762ad2b936d070b4fbaf3f9acb09;p=youtube-dl.git diff --git a/youtube-dl b/youtube-dl index b22f1ca..5c81973 100755 --- a/youtube-dl +++ b/youtube-dl @@ -18,12 +18,14 @@ __authors__ = ( ) __license__ = 'Public Domain' -__version__ = '2012.01.08' +__version__ = '2012.02.27' UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' + import cookielib import datetime +import getpass import gzip import htmlentitydefs import HTMLParser @@ -31,9 +33,11 @@ import httplib import locale import math import netrc +import optparse import os import os.path import re +import shlex import socket import string import subprocess @@ -305,7 +309,14 @@ def _encodeFilename(s): """ assert type(s) == type(u'') - return s.encode(sys.getfilesystemencoding(), 'ignore') + + if sys.platform == 'win32' and sys.getwindowsversion().major >= 5: + # Pass u'' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + return s + else: + return s.encode(sys.getfilesystemencoding(), 'ignore') class DownloadError(Exception): """Download Error exception. @@ -479,6 +490,7 @@ class FileDownloader(object): updatetime: Use the Last-modified header to set output file timestamps. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writesubtitles: Write the video subtitles to a .srt file """ params = None @@ -670,6 +682,10 @@ class FileDownloader(object): """ Report that the description file is being written """ self.to_screen(u'[info] Writing video description to: ' + descfn) + def report_writesubtitles(self, srtfn): + """ Report that the subtitles file is being written """ + self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) + def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) @@ -797,6 +813,21 @@ class FileDownloader(object): except (OSError, IOError): self.trouble(u'ERROR: Cannot write description file ' + descfn) return + + if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + try: + srtfn = filename.rsplit('.', 1)[0] + u'.srt' + self.report_writesubtitles(srtfn) + srtfile = open(_encodeFilename(srtfn), 'wb') + try: + srtfile.write(info_dict['subtitles'].encode('utf-8')) + finally: + srtfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) + return if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' @@ -889,7 +920,15 @@ class FileDownloader(object): # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] - retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]) + args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)] + if self.params.get('verbose', False): + try: + import pipes + shell_quote = lambda args: ' '.join(map(pipes.quote, args)) + except ImportError: + shell_quote = repr + self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) + retval = subprocess.call(args) while retval == 2 or retval == 1: prevsize = os.path.getsize(_encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) @@ -1187,6 +1226,10 @@ class YoutubeIE(InfoExtractor): """Report attempt to download video info webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) + def report_video_subtitles_download(self, video_id): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) @@ -1199,6 +1242,23 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') + def _closed_captions_xml_to_srt(self, xml_string): + srt = '' + texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) + # TODO parse xml instead of regex + for n, (start, dur_tag, dur, caption) in enumerate(texts): + if not dur: dur = '4' + start = float(start) + end = start + float(dur) + start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) + end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) + caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) + caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional + srt += str(n) + '\n' + srt += start + ' --> ' + end + '\n' + srt += caption + '\n\n' + return srt + def _print_formats(self, formats): print 'Available formats:' for x in formats: @@ -1362,15 +1422,38 @@ class YoutubeIE(InfoExtractor): lxml.etree except NameError: video_description = u'No description available.' - if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False): - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') else: html_parser = lxml.etree.HTMLParser(encoding='utf-8') vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) # TODO use another parser + + # closed captions + video_subtitles = None + if self._downloader.params.get('writesubtitles', False): + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) + if srt_lang_list: + if 'en' in srt_lang_list: srt_lang = 'en' + else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + else: + self._downloader.trouble(u'WARNING: video has no subtitles') # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -1443,6 +1526,7 @@ class YoutubeIE(InfoExtractor): 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, + 'subtitles': video_subtitles }) except UnavailableVideoError, err: self._downloader.trouble(u'\nERROR: unable to download video') @@ -2040,7 +2124,7 @@ class VimeoIE(InfoExtractor): video_id = mobj.group(1) # Retrieve video webpage to extract further information - request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers) + request = urllib2.Request(url, None, std_headers) try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() @@ -2053,77 +2137,75 @@ class VimeoIE(InfoExtractor): # and latter we extract those that are Vimeo specific. self.report_extraction(video_id) - # Extract title - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + # Extract the config JSON + config = webpage.split(' = {config:')[1].split(',assets:')[0] + try: + config = json.loads(config) + except: + self._downloader.trouble(u'ERROR: unable to extract info section') return - video_title = mobj.group(1).decode('utf-8') + + # Extract title + video_title = config["video"]["title"] simple_title = _simplify_title(video_title) # Extract uploader - mobj = re.search(r'http://vimeo.com/(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') - return - video_uploader = mobj.group(1).decode('utf-8') + video_uploader = config["video"]["owner"]["name"] # Extract video thumbnail - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') - return - video_thumbnail = mobj.group(1).decode('utf-8') + video_thumbnail = config["video"]["thumbnail"] - # # Extract video description - # mobj = re.search(r'', webpage) - # if mobj is None: - # self._downloader.trouble(u'ERROR: unable to extract video description') - # return - # video_description = mobj.group(1).decode('utf-8') - # if not video_description: video_description = 'No description available.' - video_description = 'Foo.' - - # Vimeo specific: extract request signature - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract request signature') - return - sig = mobj.group(1).decode('utf-8') - - # Vimeo specific: extract video quality information - mobj = re.search(r'(\d+)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video quality information') - return - quality = mobj.group(1).decode('utf-8') - - if int(quality) == 1: - quality = 'hd' + # Extract video description + try: + lxml.etree + except NameError: + video_description = u'No description available.' + mobj = re.search(r'', webpage, re.MULTILINE) + if mobj is not None: + video_description = mobj.group(1) else: - quality = 'sd' + html_parser = lxml.etree.HTMLParser() + vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) + video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() + # TODO use another parser - # Vimeo specific: Extract request signature expiration - mobj = re.search(r'(.*?)', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract request signature expiration') + # Extract upload date + video_upload_date = u'NA' + mobj = re.search(r'', webpage) + if mobj is not None: + video_upload_date = mobj.group(1) + + # Vimeo specific: extract request signature and timestamp + sig = config['request']['signature'] + timestamp = config['request']['timestamp'] + + # Vimeo specific: extract video codec and quality information + # TODO bind to format param + codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] + for codec in codecs: + if codec[0] in config["video"]["files"]: + video_codec = codec[0] + video_extension = codec[1] + if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' + else: quality = 'sd' + break + else: + self._downloader.trouble(u'ERROR: no known codec found') return - sig_exp = mobj.group(1).decode('utf-8') - video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality) + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, quality, video_codec.upper()) try: # Process video information self._downloader.process_info({ - 'id': video_id.decode('utf-8'), + 'id': video_id, 'url': video_url, 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, 'stitle': simple_title, - 'ext': u'mp4', - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description, + 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, 'player_url': None, @@ -2322,8 +2404,8 @@ class GoogleSearchIE(InfoExtractor): """Information Extractor for Google Video search queries.""" _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' - _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' - _MORE_PAGES_INDICATOR = r'Next' + _VIDEO_INDICATOR = r'