Document -o %(upload_date)s (Closes #228)

[youtube-dl.git] / youtube-dl
diff --git a/youtube-dl b/youtube-dl

index d4eadc9..63ad30f 100755 (executable)
--- a/youtube-dl
+++ b/youtube-dl
@@ -14,10 +14,11 @@ __author__  = (
         'Sören Schulze',
         'Kevin Ngo',
         'Ori Avtalion',
+       'shizeeg',
         )
  
  __license__ = 'Public Domain'
-__version__ = '2011.10.19'
+__version__ = '2011.11.23'
  
  UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  
@@ -278,7 +279,8 @@ def timeconvert(timestr):
         return timestamp
  
  def _simplify_title(title):
-       return re.sub(ur'[^\w\d_\-]+', u'_', title).strip(u'_')
+       expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
+       return expr.sub(u'_', title).strip(u'_')
  
  class DownloadError(Exception):
         """Download Error exception.
@@ -699,6 +701,13 @@ class FileDownloader(object):
  
         def process_info(self, info_dict):
                 """Process a single dictionary returned by an InfoExtractor."""
+
+               max_downloads = int(self.params.get('max_downloads'))
+               if max_downloads is not None:
+                       if self._num_downloads > max_downloads:
+                               self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
+                               return
+               
                 filename = self.prepare_filename(info_dict)
                 
                 # Forced printings
@@ -2013,7 +2022,7 @@ class VimeoIE(InfoExtractor):
                         self._downloader.trouble(u'ERROR: unable to extract video title')
                         return
                 video_title = mobj.group(1).decode('utf-8')
-               simple_title = _simple_title(video_title)
+               simple_title = _simplify_title(video_title)
  
                 # Extract uploader
                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
@@ -2937,6 +2946,7 @@ class BlipTVIE(InfoExtractor):
                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                                 basename = url.split('/')[-1]
                                 title,ext = os.path.splitext(basename)
+                               title = title.decode('UTF-8')
                                 ext = ext.replace('.', '')
                                 self.report_direct_download(title)
                                 info = {
@@ -3089,9 +3099,9 @@ class ComedyCentralIE(InfoExtractor):
  
                 if mobj.group('shortname'):
                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
-                               url = 'http://www.thedailyshow.com/full-episodes/'
+                               url = u'http://www.thedailyshow.com/full-episodes/'
                         else:
-                               url = 'http://www.colbertnation.com/full-episodes/'
+                               url = u'http://www.colbertnation.com/full-episodes/'
                         mobj = re.match(self._VALID_URL, url)
                         assert mobj is not None
  
@@ -3177,14 +3187,14 @@ class ComedyCentralIE(InfoExtractor):
  
                         self._downloader.increment_downloads()
  
-                       effTitle = showId + '-' + epTitle
+                       effTitle = showId + u'-' + epTitle
                         info = {
                                 'id': shortMediaId,
                                 'url': video_url,
                                 'uploader': showId,
                                 'upload_date': officialDate,
                                 'title': effTitle,
-                               'stitle': self._simplify_title(effTitle),
+                               'stitle': _simplify_title(effTitle),
                                 'ext': 'mp4',
                                 'format': format,
                                 'thumbnail': None,
@@ -3505,7 +3515,7 @@ class SoundcloudIE(InfoExtractor):
                 if mobj:
                         try:
                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
-                       except Exception as e:
+                       except Exception, e:
                                 print str(e)
  
                 # for soundcloud, a request to a cross domain is required for cookies
@@ -3606,6 +3616,127 @@ class InfoQIE(InfoExtractor):
                 except UnavailableVideoError, err:
                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
  
+class MixcloudIE(InfoExtractor):
+       """Information extractor for www.mixcloud.com
+
+       Looks a cloudcast up through the Mixcloud JSON API, probes the audio
+       URLs listed in its 'audio_formats' section and hands the first one
+       that can be opened to the downloader.
+       """
+       _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+       IE_NAME = u'mixcloud'
+
+       def __init__(self, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+
+       def report_download_json(self, file_id):
+               """Report JSON download (file_id is accepted but not printed)."""
+               self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
+
+       def report_extraction(self, file_id):
+               """Report information extraction."""
+               self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
+
+       def get_urls(self, jsonData, fmt, bitrate='best'):
+               """Get urls from 'audio_formats' section in json
+
+               jsonData[fmt] is treated as a bitrate -> url list mapping. When
+               indexing it by bitrate fails (no bitrate info) or it is empty,
+               jsonData[fmt] itself is returned.
+               """
+               try:
+                       bitrate_list = jsonData[fmt]
+                       if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
+                               bitrate = max(bitrate_list) # select highest
+
+                       url_list = jsonData[fmt][bitrate]
+               except (TypeError, ValueError):
+                       # TypeError: we have no bitrate info.
+                       # ValueError: max() was given an empty entry.
+                       url_list = jsonData[fmt]
+
+               return url_list
+
+       def check_urls(self, url_list):
+               """Returns 1st active url from list, or None if none can be opened"""
+               for url in url_list:
+                       try:
+                               # Only reachability matters here; close the probe
+                               # right away so the connection is not leaked.
+                               urllib2.urlopen(url).close()
+                               return url
+                       except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                               continue
+
+               return None
+
+       def _print_formats(self, formats):
+               """Print one line per format/bitrate with the file extension."""
+               print 'Available formats:'
+               for fmt in formats:
+                       for b in formats[fmt]:
+                               try:
+                                       ext = formats[fmt][b][0]
+                                       print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
+                               except TypeError: # we have no bitrate info
+                                       ext = formats[fmt][0]
+                                       print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
+                                       break
+
+       def _real_extract(self, url):
+               """Resolve a cloudcast URL and pass its info dict to the downloader.
+
+               Reports an error through the downloader and returns early when
+               the URL is invalid, the API request or its JSON cannot be used,
+               the requested format is missing, or no candidate URL opens.
+               """
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+                       return
+               # extract uploader & filename from url
+               uploader = mobj.group(1).decode('utf-8')
+               file_id = uploader + "-" + mobj.group(2).decode('utf-8')
+
+               # construct API request from the matched groups, so that URLs
+               # without a trailing slash (accepted by _VALID_URL) work too
+               api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + mobj.group(1) + '/' + mobj.group(2) + '.json'
+               # retrieve .json file with links to files
+               request = urllib2.Request(api_url)
+               try:
+                       self.report_download_json(api_url)
+                       jsonData = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
+                       return
+
+               # parse JSON
+               try:
+                       json_data = json.loads(jsonData)
+                       player_url = json_data['player_swf_url']
+                       formats = dict(json_data['audio_formats'])
+               except (ValueError, KeyError, TypeError), err:
+                       self._downloader.trouble(u'ERROR: unable to parse cloudcast information: %s' % str(err))
+                       return
+
+               req_format = self._downloader.params.get('format', None)
+
+               if self._downloader.params.get('listformats', None):
+                       self._print_formats(formats)
+                       return
+
+               file_url = None
+               format_param = None
+               if req_format is None or req_format == 'best':
+                       # take the first format that has a reachable url
+                       for candidate in formats:
+                               file_url = self.check_urls(self.get_urls(formats, candidate))
+                               if file_url is not None:
+                                       format_param = candidate
+                                       break # got it!
+               else:
+                       if req_format not in formats:
+                               self._downloader.trouble(u'ERROR: format is not available')
+                               return
+
+                       file_url = self.check_urls(self.get_urls(formats, req_format))
+                       format_param = req_format
+
+               if file_url is None:
+                       # nothing could be opened; do not count a download
+                       self._downloader.trouble(u'ERROR: unable to find a working file URL')
+                       return
+
+               # We have audio
+               self._downloader.increment_downloads()
+               try:
+                       # Process file information
+                       # (file_id and uploader were already decoded above)
+                       self._downloader.process_info({
+                               'id':           file_id,
+                               'url':          file_url.decode('utf-8'),
+                               'uploader':     uploader,
+                               'upload_date':  u'NA',
+                               'title':        json_data['name'],
+                               'stitle':       _simplify_title(json_data['name']),
+                               'ext':          file_url.split('.')[-1].decode('utf-8'),
+                               'format':       format_param.decode('utf-8'),
+                               'thumbnail':    json_data['thumbnail_url'],
+                               'description':  json_data['description'],
+                               'player_url':   player_url.decode('utf-8'),
+                       })
+               except UnavailableVideoError, err:
+                       self._downloader.trouble(u'ERROR: unable to download file')
+
  
  
  class PostProcessor(object):
@@ -3873,6 +4004,7 @@ def parseOpts():
                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
+       selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
  
         authentication.add_option('-u', '--username',
                         dest='username', metavar='USERNAME', help='account username')
@@ -3929,7 +4061,7 @@ def parseOpts():
                         action='store_true', dest='autonumber',
                         help='number downloaded files starting from 00000', default=False)
         filesystem.add_option('-o', '--output',
-                       dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
+                       dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent')
         filesystem.add_option('-a', '--batch-file',
                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
         filesystem.add_option('-w', '--no-overwrites',
@@ -4006,6 +4138,7 @@ def gen_extractors():
                 XVideosIE(),
                 SoundcloudIE(),
                 InfoQIE(),
+               MixcloudIE(),
  
                 GenericIE()
         ]
@@ -4141,6 +4274,7 @@ def _real_main():
                 'writeinfojson': opts.writeinfojson,
                 'matchtitle': opts.matchtitle,
                 'rejecttitle': opts.rejecttitle,
+               'max_downloads': int(opts.max_downloads),
                 })
         for extractor in extractors:
                 fd.add_info_extractor(extractor)