X-Git-Url: http://git.jankratochvil.net/?a=blobdiff_plain;f=youtube-dl;h=d4eadc9059d36a2f396bc207cfb13826eb4f15b4;hb=e092418d8b360aaaf8c7eba67956010f4e363121;hp=884e6f9d2dc87b7fcad96410d6c6d18ba3006a14;hpb=3b98a5ddac9cbf39158b8c2ba5a61d45eee2125e;p=youtube-dl.git

diff --git a/youtube-dl b/youtube-dl
index 884e6f9..d4eadc9 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -12,6 +12,8 @@ __author__  = (
 	'RogÃ©rio Brito',
 	'Philipp Hagemeister',
 	'SÃ¶ren Schulze',
+	'Kevin Ngo',
+	'Ori Avtalion',
 	)
 
 __license__ = 'Public Domain'
@@ -77,8 +79,6 @@ std_headers = {
 	'Accept-Language': 'en-us,en;q=0.5',
 }
 
-simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
-
 try:
 	import json
 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
@@ -277,6 +277,8 @@ def timeconvert(timestr):
 		timestamp = email.utils.mktime_tz(timetuple)
 	return timestamp
 
+def _simplify_title(title):  # replace each run of chars outside [\w\d_\-] with u'_', then strip u'_' from both ends
+	return re.sub(ur'[^\w\d_\-]+', u'_', title).strip(u'_')
 
 class DownloadError(Exception):
 	"""Download Error exception.
@@ -1289,8 +1291,7 @@ class YoutubeIE(InfoExtractor):
 		video_title = sanitize_title(video_title)
 
 		# simplified title
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
-		simple_title = simple_title.strip(ur'_')
+		simple_title = _simplify_title(video_title)
 
 		# thumbnail image
 		if 'thumbnail_url' not in video_info:
@@ -1560,9 +1561,6 @@ class DailymotionIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url):
 		# Extract id and simplified title from URL
 		mobj = re.match(self._VALID_URL, url)
@@ -1651,9 +1649,6 @@ class GoogleIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url):
 		# Extract id from URL
 		mobj = re.match(self._VALID_URL, url)
@@ -1697,7 +1692,7 @@ class GoogleIE(InfoExtractor):
 			return
 		video_title = mobj.group(1).decode('utf-8')
 		video_title = sanitize_title(video_title)
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
+		simple_title = _simplify_title(video_title)
 
 		# Extract video description
 		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
@@ -1758,9 +1753,6 @@ class PhotobucketIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url):
 		# Extract id from URL
 		mobj = re.match(self._VALID_URL, url)
@@ -1799,7 +1791,7 @@ class PhotobucketIE(InfoExtractor):
 			return
 		video_title = mobj.group(1).decode('utf-8')
 		video_title = sanitize_title(video_title)
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
+		simple_title = _simplify_title(video_title)
 
 		video_uploader = mobj.group(2).decode('utf-8')
 
@@ -1840,9 +1832,6 @@ class YahooIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url, new_video=True):
 		# Extract ID from URL
 		mobj = re.match(self._VALID_URL, url)
@@ -1896,7 +1885,7 @@ class YahooIE(InfoExtractor):
 			self._downloader.trouble(u'ERROR: unable to extract video title')
 			return
 		video_title = mobj.group(1).decode('utf-8')
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
+		simple_title = _simplify_title(video_title)
 
 		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 		if mobj is None:
@@ -1993,9 +1982,6 @@ class VimeoIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url, new_video=True):
 		# Extract ID from URL
 		mobj = re.match(self._VALID_URL, url)
@@ -2027,7 +2013,7 @@ class VimeoIE(InfoExtractor):
 			self._downloader.trouble(u'ERROR: unable to extract video title')
 			return
 		video_title = mobj.group(1).decode('utf-8')
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
+		simple_title = _simplify_title(video_title)
 
 		# Extract uploader
 		mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
@@ -2118,9 +2104,6 @@ class GenericIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url):
 		# At this point we have a new video
 		self._downloader.increment_downloads()
@@ -2174,7 +2157,7 @@ class GenericIE(InfoExtractor):
 			return
 		video_title = mobj.group(1).decode('utf-8')
 		video_title = sanitize_title(video_title)
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
+		simple_title = _simplify_title(video_title)
 
 		# video uploader is domain name
 		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
@@ -2647,9 +2630,6 @@ class DepositFilesIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self, url):
 		# At this point we have a new file
 		self._downloader.increment_downloads()
@@ -2847,9 +2827,7 @@ class FacebookIE(InfoExtractor):
 		video_title = video_title.decode('utf-8')
 		video_title = sanitize_title(video_title)
 
-		# simplified title
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
-		simple_title = simple_title.strip(ur'_')
+		simple_title = _simplify_title(video_title)
 
 		# thumbnail image
 		if 'thumbnail' not in video_info:
@@ -2940,11 +2918,6 @@ class BlipTVIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
 
-	def _simplify_title(self, title):
-		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
-		res = res.strip(ur'_')
-		return res
-
 	def _real_extract(self, url):
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
@@ -2970,7 +2943,7 @@ class BlipTVIE(InfoExtractor):
 					'id': title,
 					'url': url,
 					'title': title,
-					'stitle': self._simplify_title(title),
+					'stitle': _simplify_title(title),
 					'ext': ext,
 					'urlhandle': urlh
 				}
@@ -3004,7 +2977,7 @@ class BlipTVIE(InfoExtractor):
 					'uploader': data['display_name'],
 					'upload_date': upload_date,
 					'title': data['title'],
-					'stitle': self._simplify_title(data['title']),
+					'stitle': _simplify_title(data['title']),
 					'ext': ext,
 					'format': data['media']['mimeType'],
 					'thumbnail': data['thumbnailUrl'],
@@ -3040,9 +3013,6 @@ class MyVideoIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
 
-	def _real_initialize(self):
-		return
-
 	def _real_extract(self,url):
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
@@ -3050,10 +3020,6 @@ class MyVideoIE(InfoExtractor):
 			return
 
 		video_id = mobj.group(1)
-		simple_title = mobj.group(2).decode('utf-8')
-		# should actually not be necessary
-		simple_title = sanitize_title(simple_title)
-		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
 
 		# Get video webpage
 		request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
@@ -3080,6 +3046,8 @@ class MyVideoIE(InfoExtractor):
 		video_title = mobj.group(1)
 		video_title = sanitize_title(video_title)
 
+		simple_title = _simplify_title(video_title)
+
 		try:
 			self._downloader.process_info({
 				'id':		video_id,
@@ -3113,11 +3081,6 @@ class ComedyCentralIE(InfoExtractor):
 	def report_player_url(self, episode_id):
 		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
 
-	def _simplify_title(self, title):
-		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
-		res = res.strip(ur'_')
-		return res
-
 	def _real_extract(self, url):
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
@@ -3248,11 +3211,6 @@ class EscapistIE(InfoExtractor):
 	def report_config_download(self, showName):
 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 
-	def _simplify_title(self, title):
-		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
-		res = res.strip(ur'_')
-		return res
-
 	def _real_extract(self, url):
 		htmlParser = HTMLParser.HTMLParser()
 
@@ -3305,7 +3263,7 @@ class EscapistIE(InfoExtractor):
 			'uploader': showName,
 			'upload_date': None,
 			'title': showName,
-			'stitle': self._simplify_title(showName),
+			'stitle': _simplify_title(showName),
 			'ext': 'flv',
 			'format': 'flv',
 			'thumbnail': imgUrl,
@@ -3333,11 +3291,6 @@ class CollegeHumorIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
-	def _simplify_title(self, title):
-		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
-		res = res.strip(ur'_')
-		return res
-
 	def _real_extract(self, url):
 		htmlParser = HTMLParser.HTMLParser()
 
@@ -3379,7 +3332,7 @@ class CollegeHumorIE(InfoExtractor):
 			videoNode = mdoc.findall('./video')[0]
 			info['description'] = videoNode.findall('./description')[0].text
 			info['title'] = videoNode.findall('./caption')[0].text
-			info['stitle'] = self._simplify_title(info['title'])
+			info['stitle'] = _simplify_title(info['title'])
 			info['url'] = videoNode.findall('./file')[0].text
 			info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
 			info['ext'] = info['url'].rpartition('.')[2]
@@ -3410,11 +3363,6 @@ class XVideosIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
-	def _simplify_title(self, title):
-		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
-		res = res.strip(ur'_')
-		return res
-
 	def _real_extract(self, url):
 		htmlParser = HTMLParser.HTMLParser()
 
@@ -3468,7 +3416,7 @@ class XVideosIE(InfoExtractor):
 			'uploader': None,
 			'upload_date': None,
 			'title': video_title,
-			'stitle': self._simplify_title(video_title),
+			'stitle': _simplify_title(video_title),
 			'ext': 'flv',
 			'format': 'flv',
 			'thumbnail': video_thumbnail,
@@ -3482,6 +3430,104 @@ class XVideosIE(InfoExtractor):
 			self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
 
 
+class SoundcloudIE(InfoExtractor):
+	"""Information extractor for soundcloud.com
+	   To access the media, the uid of the song and a stream token
+	   must be extracted from the page source and the script must make
+	   a request to media.soundcloud.com/crossdomain.xml. Then
+	   the media can be grabbed by requesting from an url composed
+	   of the stream token and uid
+	 """
+
+	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
+	IE_NAME = u'soundcloud'
+
+	def __init__(self, downloader=None):
+		InfoExtractor.__init__(self, downloader)
+
+	def report_webpage(self, video_id):
+		"""Report webpage download."""
+		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
+
+	def report_extraction(self, video_id):
+		"""Report information extraction."""
+		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
+
+	def _real_extract(self, url):
+		"""Scrape the track page at url and pass the track info to the downloader."""
+
+		mobj = re.match(self._VALID_URL, url)
+		if mobj is None:
+			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+			return
+
+		# extract uploader (which is in the url)
+		uploader = mobj.group(1).decode('utf-8')
+		# extract simple title (uploader + slug of song title)
+		slug_title =  mobj.group(2).decode('utf-8')
+		simple_title = uploader + '-' + slug_title
+
+		self.report_webpage('%s/%s' % (uploader, slug_title))
+
+		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
+		try:
+			webpage = urllib2.urlopen(request).read()
+		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
+			return
+
+		self.report_extraction('%s/%s' % (uploader, slug_title))
+
+		# extract uid and stream token; without them no media url can be built
+		mobj = re.search(r'"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
+		if mobj is None:
+			self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
+			return
+		video_id, stream_token = mobj.group(1), mobj.group(2)
+
+		# extract unsimplified title, falling back to the slug-based one
+		mobj = re.search(r'"title":"(.*?)",', webpage)
+		title = sanitize_title(mobj.group(1).decode('utf-8')) if mobj else simple_title
+
+		# construct media url (with uid/token)
+		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
+		mediaURL = mediaURL % (video_id, stream_token)
+
+		# description (decoded here so the value is unicode on both paths)
+		description = u'No description available'
+		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
+		if mobj:
+			description = mobj.group(1).decode('utf-8')
+
+		# upload date (best effort: stays None when missing or unparseable)
+		upload_date = None
+		mobj = re.search(r"pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
+		if mobj:
+			try:
+				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
+			except ValueError as err:
+				self._downloader.to_screen(u'[%s] WARNING: unable to parse upload date: %s' % (self.IE_NAME, str(err)))
+
+		# crossdomain.xml request (for cookies); NOTE(review): built but never opened -- confirm whether it must be sent
+		request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', None, std_headers)
+
+		try:
+			self._downloader.process_info({
+				'id':		video_id.decode('utf-8'),
+				'url':		mediaURL,
+				'uploader':	uploader,
+				'upload_date':  upload_date,
+				'title':	title,
+				'stitle':	simple_title,
+				'ext':		u'mp3',
+				'format':	u'NA',
+				'player_url':	None,
+				'description': description
+			})
+		except UnavailableVideoError:
+			self._downloader.trouble(u'\nERROR: unable to download video')
+
+
 class InfoQIE(InfoExtractor):
 	"""Information extractor for infoq.com"""
 
@@ -3496,11 +3542,6 @@ class InfoQIE(InfoExtractor):
 		"""Report information extraction."""
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
-	def _simplify_title(self, title):
-		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
-		res = res.strip(ur'_')
-		return res
-
 	def _real_extract(self, url):
 		htmlParser = HTMLParser.HTMLParser()
 
@@ -3536,7 +3577,6 @@ class InfoQIE(InfoExtractor):
 			return
 		video_title = mobj.group(1).decode('utf-8')
 
-
 		# Extract description
 		video_description = u'No description available.'
 		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
@@ -3553,7 +3593,7 @@ class InfoQIE(InfoExtractor):
 			'uploader': None,
 			'upload_date': None,
 			'title': video_title,
-			'stitle': self._simplify_title(video_title),
+			'stitle': _simplify_title(video_title),
 			'ext': extension,
 			'format': extension, # Extension is always(?) mp4, but seems to be flv
 			'thumbnail': None,
@@ -3567,6 +3607,7 @@ class InfoQIE(InfoExtractor):
 			self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
 
 
+
 class PostProcessor(object):
 	"""Post Processor class.
 
@@ -3963,12 +4004,13 @@ def gen_extractors():
 		EscapistIE(),
 		CollegeHumorIE(),
 		XVideosIE(),
+		SoundcloudIE(),
 		InfoQIE(),
 
 		GenericIE()
 	]
 
-def main():
+def _real_main():
 	parser, opts, args = parseOpts()
 
 	# Open appropriate CookieJar
@@ -4128,10 +4170,9 @@ def main():
 
 	sys.exit(retcode)
 
-
-if __name__ == '__main__':
+def main():
 	try:
-		main()
+		_real_main()
 	except DownloadError:
 		sys.exit(1)
 	except SameFileError:
@@ -4139,4 +4180,7 @@ if __name__ == '__main__':
 	except KeyboardInterrupt:
 		sys.exit(u'\nERROR: Interrupted by user')
 
+if __name__ == '__main__':
+	main()
+
 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: