X-Git-Url: http://git.jankratochvil.net/?a=blobdiff_plain;f=youtube_dl%2F__init__.py;h=2404e2359ced09d34dcdb06245998de7e9458cf7;hb=c92e184f751a3b58b5a6fbf090f4882932e5bd4b;hp=a5af555ff1471630f72853bb3c97cd6cfe7c6990;hpb=235b3ba479af9c779dd2609ba07003c0748b1a25;p=youtube-dl.git diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a5af555..2404e23 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -__author__ = ( +__authors__ = ( 'Ricardo Garcia Gonzalez', 'Danny Colligan', 'Benjamin Johnson', @@ -14,10 +14,11 @@ __author__ = ( 'Sören Schulze', 'Kevin Ngo', 'Ori Avtalion', + 'shizeeg', ) __license__ = 'Public Domain' -__version__ = '2011.10.19' +__version__ = '2011.12.18' UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' @@ -79,8 +80,6 @@ std_headers = { 'Accept-Language': 'en-us,en;q=0.5', } -simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') - try: import json except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson): @@ -279,6 +278,17 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp +def _simplify_title(title): + expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) + return expr.sub(u'_', title).strip(u'_') + +def _orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res class DownloadError(Exception): """Download Error exception. @@ -307,6 +317,10 @@ class PostProcessingError(Exception): """ pass +class MaxDownloadsReached(Exception): + """ --max-downloads limit has been reached. """ + pass + class UnavailableVideoError(Exception): """Unavailable Format exception. @@ -697,8 +711,31 @@ class FileDownloader(object): self.trouble(u'ERROR: invalid system charset or erroneous output template') return None + def _match_entry(self, info_dict): + """ Returns None iff the file should be downloaded """ + + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + return None + def process_info(self, info_dict): """Process a single dictionary returned by an InfoExtractor.""" + + reason = self._match_entry(info_dict) + if reason is not None: + self.to_screen(u'[download] ' + reason) + return + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads > int(max_downloads): + raise MaxDownloadsReached() + filename = self.prepare_filename(info_dict) # Forced printings @@ -722,20 +759,6 @@ class FileDownloader(object): if filename is None: return - matchtitle=self.params.get('matchtitle',False) - rejecttitle=self.params.get('rejecttitle',False) - title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') - if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): - self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle)) - return - if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): - self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle)) - return - - if self.params.get('nooverwrites', False) and os.path.exists(filename): - self.to_stderr(u'WARNING: file exists and will be skipped') - return - try: dn = os.path.dirname(filename) if dn != '' and not os.path.exists(dn): @@ -777,16 +800,19 @@ class FileDownloader(object): return if not self.params.get('skip_download', False): - try: - success = self._do_download(filename, info_dict) - except (OSError, IOError), err: - raise UnavailableVideoError - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self.trouble(u'ERROR: unable to download video data: %s' % str(err)) - return - except (ContentTooShortError, ), err: - self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) - return + if self.params.get('nooverwrites', False) and os.path.exists(filename): + success = True + else: + try: + success = self._do_download(filename, info_dict) + except (OSError, IOError), err: + raise UnavailableVideoError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble(u'ERROR: unable to download video data: %s' % str(err)) + return + except (ContentTooShortError, ), err: + self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + return if success: try: @@ -1094,6 +1120,7 @@ class YoutubeIE(InfoExtractor): _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -1291,8 +1318,7 @@ class YoutubeIE(InfoExtractor): video_title = sanitize_title(video_title) # simplified title - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - simple_title = simple_title.strip(ur'_') + simple_title = _simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -1344,10 +1370,11 @@ class YoutubeIE(InfoExtractor): url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) format_limit = self._downloader.params.get('format_limit', None) - if format_limit is not None and format_limit in self._available_formats: - format_list = self._available_formats[self._available_formats.index(format_limit):] + available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats + if format_limit is not None and format_limit in available_formats: + format_list = available_formats[available_formats.index(format_limit):] else: - format_list = self._available_formats + format_list = available_formats existing_formats = [x for x in format_list if x in url_map] if len(existing_formats) == 0: self._downloader.trouble(u'ERROR: no known formats available for video') @@ -1563,6 +1590,8 @@ class DailymotionIE(InfoExtractor): self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -1573,7 +1602,6 @@ class DailymotionIE(InfoExtractor): self._downloader.increment_downloads() video_id = mobj.group(1) - simple_title = mobj.group(2).decode('utf-8') video_extension = 'flv' # Retrieve video webpage to extract further information @@ -1603,12 +1631,13 @@ class DailymotionIE(InfoExtractor): video_url = mediaURL - mobj = re.search(r'(?im)Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?', webpage) + mobj = re.search(r'', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract title') return - video_title = mobj.group(1).decode('utf-8') + video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8') video_title = sanitize_title(video_title) + simple_title = _simplify_title(video_title) mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: @@ -1693,7 +1722,7 @@ class GoogleIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) # Extract video description mobj = re.search(r'([^<]*)', webpage) @@ -1792,7 +1821,7 @@ class PhotobucketIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(vide_title) video_uploader = mobj.group(2).decode('utf-8') @@ -1886,7 +1915,7 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) mobj = re.search(r'

(.*)

', webpage) if mobj is None: @@ -2014,7 +2043,7 @@ class VimeoIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) # Extract uploader mobj = re.search(r'http://vimeo.com/(.*?)', webpage) @@ -2158,7 +2187,7 @@ class GenericIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + simple_title = _simplify_title(video_title) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) @@ -2828,9 +2857,7 @@ class FacebookIE(InfoExtractor): video_title = video_title.decode('utf-8') video_title = sanitize_title(video_title) - # simplified title - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - simple_title = simple_title.strip(ur'_') + simple_title = _simplify_title(video_title) # thumbnail image if 'thumbnail' not in video_info: @@ -2921,11 +2948,6 @@ class BlipTVIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title)) - def _simplify_title(self, title): - res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) - res = res.strip(ur'_') - return res - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -2945,13 +2967,14 @@ class BlipTVIE(InfoExtractor): if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download basename = url.split('/')[-1] title,ext = os.path.splitext(basename) + title = title.decode('UTF-8') ext = ext.replace('.', '') self.report_direct_download(title) info = { 'id': title, 'url': url, 'title': title, - 'stitle': self._simplify_title(title), + 'stitle': _simplify_title(title), 'ext': ext, 'urlhandle': urlh } @@ -2985,7 +3008,7 @@ class BlipTVIE(InfoExtractor): 'uploader': data['display_name'], 'upload_date': upload_date, 'title': data['title'], - 'stitle': self._simplify_title(data['title']), + 'stitle': _simplify_title(data['title']), 'ext': ext, 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], @@ -3028,10 +3051,6 @@ class MyVideoIE(InfoExtractor): return video_id = mobj.group(1) - simple_title = mobj.group(2).decode('utf-8') - # should actually not be necessary - simple_title = sanitize_title(simple_title) - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title) # Get video webpage request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id) @@ -3058,6 +3077,8 @@ class MyVideoIE(InfoExtractor): video_title = mobj.group(1) video_title = sanitize_title(video_title) + simple_title = _simplify_title(video_title) + try: self._downloader.process_info({ 'id': video_id, @@ -3091,11 +3112,6 @@ class ComedyCentralIE(InfoExtractor): def report_player_url(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id) - def _simplify_title(self, title): - res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) - res = res.strip(ur'_') - return res - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -3104,9 +3120,9 @@ class ComedyCentralIE(InfoExtractor): if mobj.group('shortname'): if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = 'http://www.thedailyshow.com/full-episodes/' + url = u'http://www.thedailyshow.com/full-episodes/' else: - url = 'http://www.colbertnation.com/full-episodes/' + url = u'http://www.colbertnation.com/full-episodes/' mobj = re.match(self._VALID_URL, url) assert mobj is not None @@ -3135,7 +3151,7 @@ class ComedyCentralIE(InfoExtractor): return epTitle = mobj.group('episode') - mMovieParams = re.findall('', html) + mMovieParams = re.findall('(?:/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P