Merge remote-tracking branch 'ngokevin/soundcloud'
[youtube-dl.git] / youtube-dl
index 6eafc30..ff01775 100755 (executable)
@@ -2470,7 +2470,7 @@ class YahooSearchIE(InfoExtractor):
 class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists."""
 
-       _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
+       _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
@@ -2514,7 +2514,8 @@ class YoutubePlaylistIE(InfoExtractor):
 
                while True:
                        self.report_download_page(playlist_id, pagenum)
-                       request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
+                       url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
+                       request = urllib2.Request(url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@@ -2548,7 +2549,7 @@ class YoutubeUserIE(InfoExtractor):
        _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
        _GDATA_PAGE_SIZE = 50
        _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
-       _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
+       _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
        _youtube_ie = None
        IE_NAME = u'youtube:user'
 
@@ -3532,50 +3533,56 @@ class SoundcloudIE(InfoExtractor):
 
                self.report_extraction('%s/%s' % (uploader, slug_title))
 
-               # extract uid and access token
-               mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page)   
+               # extract uid and stream token that soundcloud hands out for access
+               mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)   
+               if mobj:
+                       video_id = mobj.group(1)
+                       stream_token = mobj.group(2)
+
+               # extract unsimplified title
+               mobj = re.search('"title":"(.*?)",', webpage)
                if mobj:
-                       video_id = match.group(1)
-                       stream_token = match.group(2)
+                       title = mobj.group(1)
 
-               # construct media url (with uid/token) to request song
+               # construct media url (with uid/token)
                mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
                mediaURL = mediaURL % (video_id, stream_token)
 
                # description
                description = u'No description available'
-               mobj = re.search('track-description-value"><p>(.*?)</p>', page)
+               mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
                if mobj:
                        description = mobj.group(1)
                
                # upload date
-               mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", page)
+               upload_date = None
+               mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
                if mobj:
                        try:
-                               upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
-                       except:
-                               pass
+                               upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
+                       except Exception as e:
+                               print str(e)
 
-               # for soundcloud, a request must be made to a cross domain to establish
-               # needed cookies
+               # for soundcloud, a request to a cross domain is required for cookies
                request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
 
                try:
                        self._downloader.process_info({
-                               'id':           video_id,
-                               'url':          video_url,
-                               'uploader':     uploader,
+                               'id':           video_id.decode('utf-8'),
+                               'url':          mediaURL,
+                               'uploader':     uploader.decode('utf-8'),
                                'upload_date':  upload_date,
-                               'title':        video_title,
-                               'stitle':       simple_title,
+                               'title':        simple_title.decode('utf-8'),
+                               'stitle':       simple_title.decode('utf-8'),
                                'ext':          u'mp3',
                                'format':       u'NA',
                                'player_url':   None,
-                               'description': description
+                               'description': description.decode('utf-8')
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download video')
 
+
 class PostProcessor(object):
        """Post Processor class.
 
@@ -3972,7 +3979,7 @@ def gen_extractors():
                EscapistIE(),
                CollegeHumorIE(),
                XVideosIE(),
-        SoundcloudIE(),
+               SoundcloudIE(),
 
                GenericIE()
        ]