X-Git-Url: http://git.jankratochvil.net/?p=youtube-dl.git;a=blobdiff_plain;f=youtube-dl;h=949d2e532d38eb154d9063aedfe6660ad12bed81;hp=d3582c442b0d95dd6048aae7d88a9621868cc7b1;hb=871be928a89be62f89ec21a1d9cb700dbddcaeca;hpb=7125a7ca8b5e5228a8199cb95779f83fd3ee05e2 diff --git a/youtube-dl b/youtube-dl index d3582c4..949d2e5 100755 --- a/youtube-dl +++ b/youtube-dl @@ -15,7 +15,7 @@ __author__ = ( ) __license__ = 'Public Domain' -__version__ = '2011.09.30' +__version__ = '2011.10.19' UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' @@ -1236,7 +1236,7 @@ class YoutubeIE(InfoExtractor): # Get video webpage self.report_video_webpage_download(video_id) - request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) + request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: @@ -2059,6 +2059,18 @@ class VimeoIE(InfoExtractor): return sig = mobj.group(1).decode('utf-8') + # Vimeo specific: extract video quality information + mobj = re.search(r'(\d+)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video quality information') + return + quality = mobj.group(1).decode('utf-8') + + if int(quality) == 1: + quality = 'hd' + else: + quality = 'sd' + # Vimeo specific: Extract request signature expiration mobj = re.search(r'(.*?)', webpage) if mobj is None: @@ -2066,7 +2078,7 @@ class VimeoIE(InfoExtractor): return sig_exp = mobj.group(1).decode('utf-8') - video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp) + video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality) try: # Process video information @@ -2458,7 +2470,7 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' + _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' @@ -2697,11 +2709,12 @@ class DepositFilesIE(InfoExtractor): class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P\d+)(?:.*)' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P\d+)(?:.*)' _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' _NETRC_MACHINE = 'facebook' - _available_formats = ['highqual', 'lowqual'] + _available_formats = ['video', 'highqual', 'lowqual'] _video_extensions = { + 'video': 'mp4', 'highqual': 'mp4', 'lowqual': 'mp4', } @@ -2729,10 +2742,9 @@ class FacebookIE(InfoExtractor): def _parse_page(self, video_webpage): """Extract video information from page""" # General data - data = {'title': r'class="video_title datawrap">(.*?)(.*?)', 'owner': r'\("video_owner_name", "(.*?)"\)', - 'upload_date': r'data-date="(.*?)"', 'thumbnail': r'\("thumb_url", "(?P.*?)"\)', } video_info = {} @@ -3306,6 +3318,264 @@ class EscapistIE(InfoExtractor): self._downloader.trouble(u'\nERROR: unable to download ' + videoId) +class CollegeHumorIE(InfoExtractor): + """Information extractor for collegehumor.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + IE_NAME = u'collegehumor' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _simplify_title(self, title): + res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) + res = res.strip(ur'_') + return res + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + video_id = mobj.group('videoid') + + self.report_webpage(video_id) + request = urllib2.Request(url) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + m = re.search(r'id="video:(?P[0-9]+)"', webpage) + if m is None: + self._downloader.trouble(u'ERROR: Cannot extract internal video ID') + return + internal_video_id = m.group('internalvideoid') + + info = { + 'id': video_id, + 'internal_id': internal_video_id, + } + + self.report_extraction(video_id) + xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id + try: + metaXml = urllib2.urlopen(xmlUrl).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err)) + return + + mdoc = xml.etree.ElementTree.fromstring(metaXml) + try: + videoNode = mdoc.findall('./video')[0] + info['description'] = videoNode.findall('./description')[0].text + info['title'] = videoNode.findall('./caption')[0].text + info['stitle'] = self._simplify_title(info['title']) + info['url'] = videoNode.findall('./file')[0].text + info['thumbnail'] = videoNode.findall('./thumbnail')[0].text + info['ext'] = info['url'].rpartition('.')[2] + info['format'] = info['ext'] + except IndexError: + self._downloader.trouble(u'\nERROR: Invalid metadata XML file') + return + + self._downloader.increment_downloads() + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download video') + + +class XVideosIE(InfoExtractor): + """Information extractor for xvideos.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' + IE_NAME = u'xvideos' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _simplify_title(self, title): + res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) + res = res.strip(ur'_') + return res + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + video_id = mobj.group(1).decode('utf-8') + + self.report_webpage(video_id) + + request = urllib2.Request(r'http://www.xvideos.com/video' + video_id) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + self.report_extraction(video_id) + + + # Extract video URL + mobj = re.search(r'flv_url=(.+?)&', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video url') + return + video_url = urllib2.unquote(mobj.group(1).decode('utf-8')) + + + # Extract title + mobj = re.search(r'(.*?)\s+-\s+XVID', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video title') + return + video_title = mobj.group(1).decode('utf-8') + + + # Extract video thumbnail + mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + return + video_thumbnail = mobj.group(1).decode('utf-8') + + + + self._downloader.increment_downloads() + info = { + 'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': None, + 'title': video_title, + 'stitle': self._simplify_title(video_title), + 'ext': 'flv', + 'format': 'flv', + 'thumbnail': video_thumbnail, + 'description': None, + 'player_url': None, + } + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download ' + video_id) + + +class SoundcloudIE(InfoExtractor): + """Information extractor for soundcloud.com + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ + + _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' + IE_NAME = u'soundcloud' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _real_initialize(self): + return + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + # extract uploader (which is in the url) + uploader = mobj.group(1).decode('utf-8') + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2).decode('utf-8') + simple_title = uploader + '-' + slug_title + + self.report_webpage('%s/%s' % (uploader, slug_title)) + + request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title)) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + self.report_extraction('%s/%s' % (uploader, slug_title)) + + # extract uid and access token + mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage) + if mobj: + video_id = mobj.group(1) + stream_token = mobj.group(2) + + # construct media url (with uid/token) to request song + mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" + mediaURL = mediaURL % (video_id, stream_token) + + # description + description = u'No description available' + mobj = re.search('track-description-value"><p>(.*?)</p>', webpage) + if mobj: + description = mobj.group(1) + + # upload date + upload_date = None + mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage) + if mobj: + try: + upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') + except Exception as e: + print str(e) + + # for soundcloud, a request must be made to a cross domain to establish + # needed cookies + request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) + + try: + self._downloader.process_info({ + 'id': video_id, + 'url': mediaURL, + 'uploader': uploader, + 'upload_date': upload_date, + 'title': simple_title, + 'stitle': simple_title, + 'ext': u'mp3', + 'format': u'NA', + 'player_url': None, + 'description': description + }) + except UnavailableVideoError: + self._downloader.trouble(u'\nERROR: unable to download video') class PostProcessor(object): """Post Processor class. @@ -3701,6 +3971,9 @@ def gen_extractors(): MyVideoIE(), ComedyCentralIE(), EscapistIE(), + CollegeHumorIE(), + XVideosIE(), + SoundcloudIE(), GenericIE() ]