X-Git-Url: http://git.jankratochvil.net/?a=blobdiff_plain;f=youtube-dl;h=6770678d2d4c4a655ea46e547bfd0e3169f00969;hb=d76736fc5eb0e1f17bad036082e7361b357730e6;hp=3d8c18c6e564401b6a1e761e90e8c50541576d05;hpb=2761012f69f063c65dce35f5667fdbeedad6837e;p=youtube-dl.git diff --git a/youtube-dl b/youtube-dl index 3d8c18c..6770678 100755 --- a/youtube-dl +++ b/youtube-dl @@ -15,7 +15,7 @@ __author__ = ( ) __license__ = 'Public Domain' -__version__ = '2011.09.27' +__version__ = '2011.09.30' UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' @@ -1236,7 +1236,7 @@ class YoutubeIE(InfoExtractor): # Get video webpage self.report_video_webpage_download(video_id) - request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) + request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: @@ -2458,7 +2458,7 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' + _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' @@ -3306,6 +3306,168 @@ class EscapistIE(InfoExtractor): self._downloader.trouble(u'\nERROR: unable to download ' + videoId) +class CollegeHumorIE(InfoExtractor): + """Information extractor for collegehumor.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + IE_NAME = u'collegehumor' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _simplify_title(self, title): + res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) + res = res.strip(ur'_') + return res + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + video_id = mobj.group('videoid') + + self.report_webpage(video_id) + request = urllib2.Request(url) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + m = re.search(r'id="video:(?P[0-9]+)"', webpage) + if m is None: + self._downloader.trouble(u'ERROR: Cannot extract internal video ID') + return + internal_video_id = m.group('internalvideoid') + + info = { + 'id': video_id, + 'internal_id': internal_video_id, + } + + self.report_extraction(video_id) + xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id + try: + metaXml = urllib2.urlopen(xmlUrl).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err)) + return + + mdoc = xml.etree.ElementTree.fromstring(metaXml) + try: + videoNode = mdoc.findall('./video')[0] + info['description'] = videoNode.findall('./description')[0].text + info['title'] = videoNode.findall('./caption')[0].text + info['stitle'] = self._simplify_title(info['title']) + info['url'] = videoNode.findall('./file')[0].text + info['thumbnail'] = videoNode.findall('./thumbnail')[0].text + info['ext'] = info['url'].rpartition('.')[2] + info['format'] = info['ext'] + except IndexError: + self._downloader.trouble(u'\nERROR: Invalid metadata XML file') + return + + self._downloader.increment_downloads() + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download video') + + +class XVideosIE(InfoExtractor): + """Information extractor for xvideos.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' + IE_NAME = u'xvideos' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _simplify_title(self, title): + res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) + res = res.strip(ur'_') + return res + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + video_id = mobj.group(1).decode('utf-8') + + self.report_webpage(video_id) + + request = urllib2.Request(r'http://www.xvideos.com/video' + video_id) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + self.report_extraction(video_id) + + + # Extract video URL + mobj = re.search(r'flv_url=(.+?)&', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video url') + return + video_url = urllib2.unquote(mobj.group(1).decode('utf-8')) + + + # Extract title + mobj = re.search(r'(.*?)\s+-\s+XVID', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video title') + return + video_title = mobj.group(1).decode('utf-8') + + + # Extract video thumbnail + mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + return + video_thumbnail = mobj.group(1).decode('utf-8') + + + + self._downloader.increment_downloads() + info = { + 'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': None, + 'title': video_title, + 'stitle': self._simplify_title(video_title), + 'ext': 'flv', + 'format': 'flv', + 'thumbnail': video_thumbnail, + 'description': None, + 'player_url': None, + } + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download ' + video_id) + class PostProcessor(object): """Post Processor class. @@ -3701,6 +3863,8 @@ def gen_extractors(): MyVideoIE(), ComedyCentralIE(), EscapistIE(), + CollegeHumorIE(), + XVideosIE(), GenericIE() ]