Credit: Filippo Valsorda
[youtube-dl.git] / youtube-dl
index 5c81973..5224611 100755 (executable)
@@ -491,6 +491,7 @@ class FileDownloader(object):
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        writesubtitles:   Write the video subtitles to a .srt file
+       subtitleslang:    Language of the subtitles to download
        """
 
        params = None
@@ -1443,17 +1444,24 @@ class YoutubeIE(InfoExtractor):
                        else:
                                srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                                if srt_lang_list:
-                                       if 'en' in srt_lang_list: srt_lang = 'en'
-                                       else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
-                                       request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
-                                       try:
-                                               srt_xml = urllib2.urlopen(request).read()
-                                       except (urllib2.URLError, httplib.HTTPException, socket.error), err:
-                                               self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                                       if self._downloader.params.get('subtitleslang', False):
+                                               srt_lang = self._downloader.params.get('subtitleslang')
+                                       elif 'en' in srt_lang_list:
+                                               srt_lang = 'en'
+                                       else:
+                                               srt_lang = srt_lang_list[0]
+                                       if not srt_lang in srt_lang_list:
+                                               self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
                                        else:
-                                               video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+                                               request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+                                               try:
+                                                       srt_xml = urllib2.urlopen(request).read()
+                                               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                                                       self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                                               else:
+                                                       video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                                else:
-                                       self._downloader.trouble(u'WARNING: video has no subtitles')
+                                       self._downloader.trouble(u'WARNING: video has no closed captions')
 
                # token
                video_token = urllib.unquote_plus(video_info['token'][0])
@@ -2314,9 +2322,7 @@ class GenericIE(InfoExtractor):
 class YoutubeSearchIE(InfoExtractor):
        """Information Extractor for YouTube search queries."""
        _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
-       _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
-       _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
-       _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
+       _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
        _youtube_ie = None
        _max_youtube_results = 1000
        IE_NAME = u'youtube:search'
@@ -2367,37 +2373,31 @@ class YoutubeSearchIE(InfoExtractor):
                """Downloads a specified number of results for a query"""
 
                video_ids = []
-               already_seen = set()
-               pagenum = 1
+               pagenum = 0
+               limit = n
 
-               while True:
-                       self.report_download_page(query, pagenum)
-                       result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
+               while (50 * pagenum) < limit:
+                       self.report_download_page(query, pagenum+1)
+                       result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
                        request = urllib2.Request(result_url)
                        try:
-                               page = urllib2.urlopen(request).read()
+                               data = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
-                               self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+                               self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
                                return
+                       api_response = json.loads(data)['data']
 
-                       # Extract video identifiers
-                       for mobj in re.finditer(self._VIDEO_INDICATOR, page):
-                               video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
-                               if video_id not in already_seen:
-                                       video_ids.append(video_id)
-                                       already_seen.add(video_id)
-                                       if len(video_ids) == n:
-                                               # Specified n videos reached
-                                               for id in video_ids:
-                                                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
-                                               return
+                       new_ids = list(video['id'] for video in api_response['items'])
+                       video_ids += new_ids
 
-                       if re.search(self._MORE_PAGES_INDICATOR, page) is None:
-                               for id in video_ids:
-                                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
-                               return
+                       limit = min(n, api_response['totalItems'])
+                       pagenum += 1
 
-                       pagenum = pagenum + 1
+               if len(video_ids) > n:
+                       video_ids = video_ids[:n]
+               for id in video_ids:
+                       self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+               return
 
 
 class GoogleSearchIE(InfoExtractor):
@@ -2581,7 +2581,7 @@ class YoutubePlaylistIE(InfoExtractor):
 
        _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
-       _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
+       _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
        _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
        _youtube_ie = None
        IE_NAME = u'youtube:playlist'
@@ -2633,7 +2633,7 @@ class YoutubePlaylistIE(InfoExtractor):
 
                        # Extract video identifiers
                        ids_in_page = []
-                       for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+                       for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(mobj.group(1))
                        video_ids.extend(ids_in_page)
@@ -2644,7 +2644,10 @@ class YoutubePlaylistIE(InfoExtractor):
 
                playliststart = self._downloader.params.get('playliststart', 1) - 1
                playlistend = self._downloader.params.get('playlistend', -1)
-               video_ids = video_ids[playliststart:playlistend]
+               if playlistend == -1:
+                       video_ids = video_ids[playliststart:]
+               else:
+                       video_ids = video_ids[playliststart:playlistend]
 
                for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
@@ -4390,6 +4393,12 @@ def parseOpts():
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
+       video_format.add_option('--write-srt',
+                       action='store_true', dest='writesubtitles',
+                       help='write video closed captions to a .srt file (currently youtube only)', default=False)
+       video_format.add_option('--srt-lang',
+                       action='store', dest='subtitleslang', metavar='LANG',
+                       help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
 
 
        verbosity.add_option('-q', '--quiet',
@@ -4454,9 +4463,6 @@ def parseOpts():
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)
-       filesystem.add_option('--write-srt',
-                       action='store_true', dest='writesubtitles',
-                       help='write video subtitles to a .srt file', default=False)
 
 
        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
@@ -4658,6 +4664,7 @@ def _real_main():
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
                'writesubtitles': opts.writesubtitles,
+               'subtitleslang': opts.subtitleslang,
                'matchtitle': opts.matchtitle,
                'rejecttitle': opts.rejecttitle,
                'max_downloads': opts.max_downloads,