2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Internal helper of the trivialjson fallback parser: abort parsing with
# a ValueError that pinpoints the failing position in the JSON input `s`
# (captured from the enclosing scope) and echoes the unparsed remainder.
95 def raiseError(msg, i):
96 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
# First decode HTML entities (named and numeric, e.g. &amp; or &#38;)
# via htmlentity_transform, then replace the OS path separator with '%'
# so the title cannot escape into directory components of the filename.
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
348 class MaxDownloadsReached(Exception):
349 """ --max-downloads limit has been reached. """
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
# Record both byte counts so the caller can report exactly how much of
# the announced payload actually arrived (see the error message built
# from .expected/.downloaded at the process_info call site).
373 def __init__(self, downloaded, expected):
374 self.downloaded = downloaded
375 self.expected = expected
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. Given a video URL, the downloader doesn't know how to
444 extract all the needed information (that is the InfoExtractors' task),
445 so it has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
493 writesubtitles: Write the video subtitles to a .srt file
499 _download_retcode = None
500 _num_downloads = None
503 def __init__(self, params):
504 """Create a FileDownloader object with the given options."""
507 self._download_retcode = 0
508 self._num_downloads = 0
509 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
513 def format_bytes(bytes):
516 if type(bytes) is str:
521 exponent = long(math.log(bytes, 1024.0))
522 suffix = 'bkMGTPEZY'[exponent]
523 converted = float(bytes) / float(1024 ** exponent)
524 return '%.2f%s' % (converted, suffix)
527 def calc_percent(byte_counter, data_len):
530 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
533 def calc_eta(start, now, total, current):
537 if current == 0 or dif < 0.001: # One millisecond
539 rate = float(current) / dif
540 eta = long((float(total) - float(current)) / rate)
541 (eta_mins, eta_secs) = divmod(eta, 60)
544 return '%02d:%02d' % (eta_mins, eta_secs)
547 def calc_speed(start, now, bytes):
549 if bytes == 0 or dif < 0.001: # One millisecond
550 return '%10s' % '---b/s'
551 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
554 def best_block_size(elapsed_time, bytes):
555 new_min = max(bytes / 2.0, 1.0)
556 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
557 if elapsed_time < 0.001:
559 rate = bytes / elapsed_time
567 def parse_bytes(bytestr):
568 """Parse a string indicating a byte quantity into a long integer."""
569 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
572 number = float(matchobj.group(1))
573 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
574 return long(round(number * multiplier))
576 def add_info_extractor(self, ie):
577 """Add an InfoExtractor object to the end of the list."""
579 ie.set_downloader(self)
581 def add_post_processor(self, pp):
582 """Add a PostProcessor object to the end of the chain."""
584 pp.set_downloader(self)
586 def to_screen(self, message, skip_eol=False):
587 """Print message to stdout if not in quiet mode."""
588 assert type(message) == type(u'')
589 if not self.params.get('quiet', False):
590 terminator = [u'\n', u''][skip_eol]
591 output = message + terminator
593 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
594 output = output.encode(preferredencoding(), 'ignore')
595 self._screen_file.write(output)
596 self._screen_file.flush()
# Unconditional stderr output (unlike to_screen, this ignores the
# 'quiet' option). Python 2 print-chevron syntax; the message is
# encoded to the locale's preferred encoding before writing.
598 def to_stderr(self, message):
599 """Print message to stderr."""
600 print >>sys.stderr, message.encode(preferredencoding())
602 def to_cons_title(self, message):
603 """Set console/terminal window title to message."""
604 if not self.params.get('consoletitle', False):
606 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
607 # c_wchar_p() might not be necessary if `message` is
608 # already of type unicode()
609 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
610 elif 'TERM' in os.environ:
611 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
# True when the output template contains no %(field)s placeholders,
# i.e. every download would be written to the same literal filename —
# used by download() to refuse multi-URL runs with a fixed template.
613 def fixed_template(self):
614 """Checks if the output template is fixed."""
615 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
617 def trouble(self, message=None):
618 """Determine action to take when a download problem appears.
620 Depending on if the downloader has been configured to ignore
621 download errors or not, this method may throw an exception or
622 not when errors are found, after printing the message.
624 if message is not None:
625 self.to_stderr(message)
626 if not self.params.get('ignoreerrors', False):
627 raise DownloadError(message)
628 self._download_retcode = 1
630 def slow_down(self, start_time, byte_counter):
631 """Sleep if the download speed is over the rate limit."""
632 rate_limit = self.params.get('ratelimit', None)
633 if rate_limit is None or byte_counter == 0:
636 elapsed = now - start_time
639 speed = float(byte_counter) / elapsed
640 if speed > rate_limit:
641 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
643 def temp_name(self, filename):
644 """Returns a temporary filename for the given filename."""
645 if self.params.get('nopart', False) or filename == u'-' or \
646 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
648 return filename + u'.part'
650 def undo_temp_name(self, filename):
651 if filename.endswith(u'.part'):
652 return filename[:-len(u'.part')]
655 def try_rename(self, old_filename, new_filename):
657 if old_filename == new_filename:
659 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
660 except (IOError, OSError), err:
661 self.trouble(u'ERROR: unable to rename file')
663 def try_utime(self, filename, last_modified_hdr):
664 """Try to set the last-modified time of the given file."""
665 if last_modified_hdr is None:
667 if not os.path.isfile(_encodeFilename(filename)):
669 timestr = last_modified_hdr
672 filetime = timeconvert(timestr)
676 os.utime(filename, (time.time(), filetime))
# Progress message emitted just before the .description file is written.
681 def report_writedescription(self, descfn):
682 """ Report that the description file is being written """
683 self.to_screen(u'[info] Writing video description to: ' + descfn)
# Progress message emitted just before the .srt subtitles file is written.
685 def report_writesubtitles(self, srtfn):
686 """ Report that the subtitles file is being written """
687 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
# Progress message for the .info.json metadata file.
# NOTE(review): the message text reads "Video description metadata as
# JSON to" — looks like a missing "Writing"; confirm before changing,
# since the literal is user-visible output.
689 def report_writeinfojson(self, infofn):
690 """ Report that the metadata file has been written """
691 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
# Announce the final (non-temporary) filename the video will be saved as.
693 def report_destination(self, filename):
694 """Report destination filename."""
695 self.to_screen(u'[download] Destination: ' + filename)
697 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
698 """Report download progress."""
699 if self.params.get('noprogress', False):
701 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
702 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
703 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
704 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
# Announce that a partial download is being resumed at the given offset
# (the size of the existing .part file, sent as an HTTP Range header).
706 def report_resuming_byte(self, resume_len):
707 """Report attempt to resume at given byte."""
708 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
# Announce a retry after a server-side (HTTP 5xx) error; `count` is the
# current attempt number out of the configured `retries` maximum.
710 def report_retry(self, count, retries):
711 """Report retry in case of HTTP error 5xx"""
712 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
714 def report_file_already_downloaded(self, file_name):
715 """Report file has already been fully downloaded."""
717 self.to_screen(u'[download] %s has already been downloaded' % file_name)
718 except (UnicodeEncodeError), err:
719 self.to_screen(u'[download] The file has already been downloaded')
# Announce that the Range-based resume failed (e.g. HTTP 416) and the
# download will restart from the beginning.
721 def report_unable_to_resume(self):
722 """Report it was impossible to resume download."""
723 self.to_screen(u'[download] Unable to resume')
725 def report_finish(self):
726 """Report download finished."""
727 if self.params.get('noprogress', False):
728 self.to_screen(u'[download] Download completed')
# Bump the per-run download counter; its value feeds the
# %(autonumber)s field in prepare_filename's output template.
732 def increment_downloads(self):
733 """Increment the ordinal that assigns a number to each file."""
734 self._num_downloads += 1
736 def prepare_filename(self, info_dict):
737 """Generate the output filename."""
739 template_dict = dict(info_dict)
740 template_dict['epoch'] = unicode(long(time.time()))
741 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
742 filename = self.params['outtmpl'] % template_dict
744 except (ValueError, KeyError), err:
745 self.trouble(u'ERROR: invalid system charset or erroneous output template')
748 def _match_entry(self, info_dict):
749 """ Returns None iff the file should be downloaded """
751 title = info_dict['title']
752 matchtitle = self.params.get('matchtitle', False)
753 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
754 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
755 rejecttitle = self.params.get('rejecttitle', False)
756 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
757 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
760 def process_info(self, info_dict):
761 """Process a single dictionary returned by an InfoExtractor."""
763 reason = self._match_entry(info_dict)
764 if reason is not None:
765 self.to_screen(u'[download] ' + reason)
768 max_downloads = self.params.get('max_downloads')
769 if max_downloads is not None:
770 if self._num_downloads > int(max_downloads):
771 raise MaxDownloadsReached()
773 filename = self.prepare_filename(info_dict)
776 if self.params.get('forcetitle', False):
777 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
778 if self.params.get('forceurl', False):
779 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
781 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcedescription', False) and 'description' in info_dict:
783 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcefilename', False) and filename is not None:
785 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forceformat', False):
787 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
789 # Do nothing else if in simulate mode
790 if self.params.get('simulate', False):
797 dn = os.path.dirname(_encodeFilename(filename))
798 if dn != '' and not os.path.exists(dn): # dn is already encoded
800 except (OSError, IOError), err:
801 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
804 if self.params.get('writedescription', False):
806 descfn = filename + u'.description'
807 self.report_writedescription(descfn)
808 descfile = open(_encodeFilename(descfn), 'wb')
810 descfile.write(info_dict['description'].encode('utf-8'))
813 except (OSError, IOError):
814 self.trouble(u'ERROR: Cannot write description file ' + descfn)
817 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
818 # subtitles download errors are already managed as troubles in relevant IE
819 # that way it will silently go on when used with an IE that does not support subtitles
821 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
822 self.report_writesubtitles(srtfn)
823 srtfile = open(_encodeFilename(srtfn), 'wb')
825 srtfile.write(info_dict['subtitles'].encode('utf-8'))
828 except (OSError, IOError):
829 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
832 if self.params.get('writeinfojson', False):
833 infofn = filename + u'.info.json'
834 self.report_writeinfojson(infofn)
837 except (NameError,AttributeError):
838 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
841 infof = open(_encodeFilename(infofn), 'wb')
843 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
844 json.dump(json_info_dict, infof)
847 except (OSError, IOError):
848 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
851 if not self.params.get('skip_download', False):
852 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
856 success = self._do_download(filename, info_dict)
857 except (OSError, IOError), err:
858 raise UnavailableVideoError
859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
862 except (ContentTooShortError, ), err:
863 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
868 self.post_process(filename, info_dict)
869 except (PostProcessingError), err:
870 self.trouble(u'ERROR: postprocessing: %s' % str(err))
873 def download(self, url_list):
874 """Download a given list of URLs."""
875 if len(url_list) > 1 and self.fixed_template():
876 raise SameFileError(self.params['outtmpl'])
879 suitable_found = False
881 # Go to next InfoExtractor if not suitable
882 if not ie.suitable(url):
885 # Suitable InfoExtractor found
886 suitable_found = True
888 # Extract information from URL and process it
891 # Suitable InfoExtractor had been found; go to next URL
894 if not suitable_found:
895 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
897 return self._download_retcode
899 def post_process(self, filename, ie_info):
900 """Run the postprocessing chain on the given file."""
902 info['filepath'] = filename
908 def _download_with_rtmpdump(self, filename, url, player_url):
909 self.report_destination(filename)
910 tmpfilename = self.temp_name(filename)
912 # Check for rtmpdump first
914 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
915 except (OSError, IOError):
916 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
919 # Download using rtmpdump. rtmpdump returns exit code 2 when
920 # the connection was interrupted and resuming appears to be
921 # possible. This is part of rtmpdump's normal usage, AFAIK.
922 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
923 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
924 if self.params.get('verbose', False):
927 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
930 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
931 retval = subprocess.call(args)
932 while retval == 2 or retval == 1:
933 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
934 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
935 time.sleep(5.0) # This seems to be needed
936 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
937 cursize = os.path.getsize(_encodeFilename(tmpfilename))
938 if prevsize == cursize and retval == 1:
940 # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
941 if prevsize == cursize and retval == 2 and cursize > 1024:
942 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
946 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
947 self.try_rename(tmpfilename, filename)
950 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
953 def _do_download(self, filename, info_dict):
954 url = info_dict['url']
955 player_url = info_dict.get('player_url', None)
957 # Check file already present
958 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
959 self.report_file_already_downloaded(filename)
962 # Attempt to download using rtmpdump
963 if url.startswith('rtmp'):
964 return self._download_with_rtmpdump(filename, url, player_url)
966 tmpfilename = self.temp_name(filename)
969 # Do not include the Accept-Encoding header
970 headers = {'Youtubedl-no-compression': 'True'}
971 basic_request = urllib2.Request(url, None, headers)
972 request = urllib2.Request(url, None, headers)
974 # Establish possible resume length
975 if os.path.isfile(_encodeFilename(tmpfilename)):
976 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
982 if self.params.get('continuedl', False):
983 self.report_resuming_byte(resume_len)
984 request.add_header('Range','bytes=%d-' % resume_len)
990 retries = self.params.get('retries', 0)
991 while count <= retries:
992 # Establish connection
994 if count == 0 and 'urlhandle' in info_dict:
995 data = info_dict['urlhandle']
996 data = urllib2.urlopen(request)
998 except (urllib2.HTTPError, ), err:
999 if (err.code < 500 or err.code >= 600) and err.code != 416:
1000 # Unexpected HTTP error
1002 elif err.code == 416:
1003 # Unable to resume (requested range not satisfiable)
1005 # Open the connection again without the range header
1006 data = urllib2.urlopen(basic_request)
1007 content_length = data.info()['Content-Length']
1008 except (urllib2.HTTPError, ), err:
1009 if err.code < 500 or err.code >= 600:
1012 # Examine the reported length
1013 if (content_length is not None and
1014 (resume_len - 100 < long(content_length) < resume_len + 100)):
1015 # The file had already been fully downloaded.
1016 # Explanation to the above condition: in issue #175 it was revealed that
1017 # YouTube sometimes adds or removes a few bytes from the end of the file,
1018 # changing the file size slightly and causing problems for some users. So
1019 # I decided to implement a suggested change and consider the file
1020 # completely downloaded if the file size differs less than 100 bytes from
1021 # the one in the hard drive.
1022 self.report_file_already_downloaded(filename)
1023 self.try_rename(tmpfilename, filename)
1026 # The length does not match, we start the download over
1027 self.report_unable_to_resume()
1032 if count <= retries:
1033 self.report_retry(count, retries)
1036 self.trouble(u'ERROR: giving up after %s retries' % retries)
1039 data_len = data.info().get('Content-length', None)
1040 if data_len is not None:
1041 data_len = long(data_len) + resume_len
1042 data_len_str = self.format_bytes(data_len)
1043 byte_counter = 0 + resume_len
1047 # Download and write
1048 before = time.time()
1049 data_block = data.read(block_size)
1051 if len(data_block) == 0:
1053 byte_counter += len(data_block)
1055 # Open file just in time
1058 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1059 assert stream is not None
1060 filename = self.undo_temp_name(tmpfilename)
1061 self.report_destination(filename)
1062 except (OSError, IOError), err:
1063 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1066 stream.write(data_block)
1067 except (IOError, OSError), err:
1068 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070 block_size = self.best_block_size(after - before, len(data_block))
1073 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1074 if data_len is None:
1075 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077 percent_str = self.calc_percent(byte_counter, data_len)
1078 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1079 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1082 self.slow_down(start, byte_counter - resume_len)
1085 self.trouble(u'\nERROR: Did not get any data blocks')
1088 self.report_finish()
1089 if data_len is not None and byte_counter != data_len:
1090 raise ContentTooShortError(byte_counter, long(data_len))
1091 self.try_rename(tmpfilename, filename)
1093 # Update file modification time
1094 if self.params.get('updatetime', True):
1095 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1100 class InfoExtractor(object):
1101 """Information Extractor class.
1103 Information extractors are the classes that, given a URL, extract
1104 information from the video (or videos) the URL refers to. This
1105 information includes the real video URL, the video title and simplified
1106 title, author and others. The information is stored in a dictionary
1107 which is then passed to the FileDownloader. The FileDownloader
1108 processes this information possibly downloading the video to the file
1109 system, among other possible outcomes. The dictionaries must include
1110 the following fields:
1112 id: Video identifier.
1113 url: Final video URL.
1114 uploader: Nickname of the video uploader.
1115 title: Literal title.
1116 stitle: Simplified title.
1117 ext: Video filename extension.
1118 format: Video format.
1119 player_url: SWF Player URL (may be None).
1121 The following fields are optional. Their primary purpose is to allow
1122 youtube-dl to serve as the backend for a video search function, such
1123 as the one in youtube2mp3. They are only used when their respective
1124 forced printing functions are called:
1126 thumbnail: Full URL to a video thumbnail image.
1127 description: One-line video description.
1129 Subclasses of this one should re-define the _real_initialize() and
1130 _real_extract() methods and define a _VALID_URL regexp.
1131 Probably, they should also be added to the list of extractors.
# NOTE(review): this excerpt is elided -- some original lines of this class
# body are missing (e.g. initialize() presumably guards _real_initialize()
# with a run-once flag set in __init__; confirm against the full source).
1137 def __init__(self, downloader=None):
1138 """Constructor. Receives an optional downloader."""
1140 self.set_downloader(downloader)
# suitable() is the dispatch predicate: the framework routes a URL to the
# first extractor whose _VALID_URL regexp matches it.
1142 def suitable(self, url):
1143 """Receives a URL and returns True if suitable for this IE."""
1144 return re.match(self._VALID_URL, url) is not None
1146 def initialize(self):
1147 """Initializes an instance (authentication, etc)."""
1149 self._real_initialize()
1152 def extract(self, url):
1153 """Extracts URL information and returns it in list of dicts."""
1155 return self._real_extract(url)
1157 def set_downloader(self, downloader):
1158 """Sets the downloader for this IE."""
1159 self._downloader = downloader
# Template methods -- subclasses override these two; the base versions are
# intentionally no-ops.
1161 def _real_initialize(self):
1162 """Real initialization process. Redefine in subclasses."""
1165 def _real_extract(self, url):
1166 """Real extraction process. Redefine in subclasses."""
1170 class YoutubeIE(InfoExtractor):
1171 """Information extractor for youtube.com."""
# Accepts watch/embed/short-link/nocookie URL shapes; the video id is in
# group 2 of the match (group 1 captures the optional site prefix).
1173 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Hitting this URL first forces the site into English so the scraping
# regexps below match regardless of the user's locale.
1174 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1175 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1176 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in the user's ~/.netrc file.
1177 _NETRC_MACHINE = 'youtube'
1178 # Listed in order of quality
1179 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1180 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file-extension map; most entries are elided in this excerpt.
1181 _video_extensions = {
1187 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> display-dimensions map; entries are elided in this excerpt.
1192 _video_dimensions = {
1207 IE_NAME = u'youtube'
def report_lang(self):
    """Announce that the site interface language is being forced."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Announce that a login attempt is starting."""
    notice = u'[youtube] Logging in'
    self._downloader.to_screen(notice)
def report_age_confirmation(self):
    """Announce that the age-verification step is being performed."""
    line = u'[youtube] Confirming age'
    self._downloader.to_screen(line)
def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    msg = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page for *video_id*."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_subtitles_download(self, video_id):
    # Fix: the original docstring was copy-pasted from the video-info method
    # and wrongly said "Downloading video info webpage".
    """Report attempt to download video subtitles."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    msg = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
    # Fix: the original docstring ("Report extracted video URL.") described a
    # different method; this one reports an unavailable format.
    """Report that the requested format is not available for this video."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Announce that the stream will be fetched over RTMP."""
    notice = u'[youtube] RTMP download detected'
    self._downloader.to_screen(notice)
1245 def _closed_captions_xml_to_srt(self, xml_string):
# Converts YouTube's timedtext XML into SubRip (SRT) subtitle text.
1247 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1248 # TODO parse xml instead of regex
1249 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions without an explicit duration get a 4-second default.
1250 if not dur: dur = '4'
1251 start = float(start)
1252 end = start + float(dur)
# SRT timestamp layout: HH:MM:SS,mmm
1253 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1254 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1255 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1256 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # applied twice intentionally: source entities can be doubly escaped
# NOTE(review): the srt accumulator's initialisation and the final return are
# not visible in this elided excerpt; also note SRT cue numbers conventionally
# start at 1 while enumerate() starts n at 0 -- confirm against full source.
1257 srt += str(n) + '\n'
1258 srt += start + ' --> ' + end + '\n'
1259 srt += caption + '\n\n'
1262 def _print_formats(self, formats):
# Prints each available itag with its container extension and dimensions.
1263 print 'Available formats:'
# NOTE(review): the loop header over `formats` (binding x) is elided here.
1265 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1267 def _real_initialize(self):
# Sets the site language to English, then (optionally) logs in and confirms
# age. Credentials come from --username/--password or from ~/.netrc.
# NOTE(review): this excerpt is elided -- several original lines (try:,
# return, the form dict openings, netrc field unpacking) are missing.
1268 if self._downloader is None:
1273 downloader_params = self._downloader.params
1275 # Attempt to use provided username and password or .netrc data
1276 if downloader_params.get('username', None) is not None:
1277 username = downloader_params['username']
1278 password = downloader_params['password']
1279 elif downloader_params.get('usenetrc', False):
1281 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1282 if info is not None:
1286 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1287 except (IOError, netrc.NetrcParseError), err:
# netrc problems are non-fatal: we fall through without credentials.
1288 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the interface language to English so later regexps match.
1292 request = urllib2.Request(self._LANG_URL)
1295 urllib2.urlopen(request).read()
1296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1297 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1300 # No authentication to be performed
1301 if username is None:
1306 'current_form': 'loginForm',
1308 'action_login': 'Log In',
1309 'username': username,
1310 'password': password,
1312 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1315 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
1316 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1317 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age -- required before age-restricted videos can be fetched.
1326 'action_confirm': 'Confirm',
1328 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1330 self.report_age_confirmation()
1331 age_results = urllib2.urlopen(request).read()
1332 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1333 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1336 def _real_extract(self, url):
# Full extraction pipeline: watch page -> get_video_info -> metadata ->
# format selection -> process_info() per selected format.
# NOTE(review): this excerpt is elided -- error-branch lines (if mobj is
# None:, try:, return, break) are missing throughout; read accordingly.
1337 # Extract video id from URL
1338 mobj = re.match(self._VALID_URL, url)
1340 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1342 video_id = mobj.group(2)
1345 self.report_video_webpage_download(video_id)
1346 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1348 video_webpage = urllib2.urlopen(request).read()
1349 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1353 # Attempt to extract SWF player URL
1354 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1355 if mobj is not None:
# Un-escape the JS-escaped URL (\\/ -> /).
1356 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1361 self.report_video_info_webpage_download(video_id)
# Try several 'el' variants; the first response containing a token wins.
1362 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1363 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1364 % (video_id, el_type))
1365 request = urllib2.Request(video_info_url)
1367 video_info_webpage = urllib2.urlopen(request).read()
1368 video_info = parse_qs(video_info_webpage)
1369 if 'token' in video_info:
1371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1372 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1374 if 'token' not in video_info:
1375 if 'reason' in video_info:
1376 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1378 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1381 # Start extracting information
1382 self.report_information_extraction(video_id)
1385 if 'author' not in video_info:
1386 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1388 video_uploader = urllib.unquote_plus(video_info['author'][0])
1391 if 'title' not in video_info:
1392 self._downloader.trouble(u'ERROR: unable to extract video title')
1394 video_title = urllib.unquote_plus(video_info['title'][0])
1395 video_title = video_title.decode('utf-8')
1396 video_title = sanitize_title(video_title)
1399 simple_title = _simplify_title(video_title)
1402 if 'thumbnail_url' not in video_info:
1403 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1404 video_thumbnail = ''
1405 else: # don't panic if we can't find it
1406 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped off the watch page and normalised to YYYYMMDD.
1410 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1411 if mobj is not None:
1412 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1413 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1414 for expression in format_expressions:
1416 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1424 video_description = u'No description available.'
1425 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1426 if mobj is not None:
1427 video_description = mobj.group(1).decode('utf-8')
# Prefer the full description from the page DOM when lxml is available.
1429 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1430 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1431 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1432 # TODO use another parser
1435 video_subtitles = None
1436 if self._downloader.params.get('writesubtitles', False):
1437 self.report_video_subtitles_download(video_id)
1438 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1440 srt_list = urllib2.urlopen(request).read()
1441 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1444 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1446 if 'en' in srt_lang_list: srt_lang = 'en'
1447 else: srt_lang = srt_lang_list[0] # TODO choose better and provide an override
1448 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1450 srt_xml = urllib2.urlopen(request).read()
1451 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1452 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1454 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1456 self._downloader.trouble(u'WARNING: video has no subtitles')
1459 video_token = urllib.unquote_plus(video_info['token'][0])
1461 # Decide which formats to download
1462 req_format = self._downloader.params.get('format', None)
# RTMP streams bypass format selection entirely.
1464 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1465 self.report_rtmp_download()
1466 video_url_list = [(None, video_info['conn'][0])]
1467 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1468 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1469 url_data = [parse_qs(uds) for uds in url_data_strs]
1470 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1471 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1473 format_limit = self._downloader.params.get('format_limit', None)
1474 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1475 if format_limit is not None and format_limit in available_formats:
1476 format_list = available_formats[available_formats.index(format_limit):]
1478 format_list = available_formats
1479 existing_formats = [x for x in format_list if x in url_map]
1480 if len(existing_formats) == 0:
1481 self._downloader.trouble(u'ERROR: no known formats available for video')
1483 if self._downloader.params.get('listformats', None):
1484 self._print_formats(existing_formats)
1486 if req_format is None or req_format == 'best':
1487 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1488 elif req_format == 'worst':
1489 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1490 elif req_format in ('-1', 'all'):
1491 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1493 # Specific formats. We pick the first in a slash-delimeted sequence.
1494 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1495 req_formats = req_format.split('/')
1496 video_url_list = None
1497 for rf in req_formats:
1499 video_url_list = [(rf, url_map[rf])]
1501 if video_url_list is None:
1502 self._downloader.trouble(u'ERROR: requested format not available')
1505 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1508 for format_param, video_real_url in video_url_list:
1509 # At this point we have a new video
1510 self._downloader.increment_downloads()
1513 video_extension = self._video_extensions.get(format_param, 'flv')
1516 # Process video information
1517 self._downloader.process_info({
1518 'id': video_id.decode('utf-8'),
1519 'url': video_real_url.decode('utf-8'),
1520 'uploader': video_uploader.decode('utf-8'),
1521 'upload_date': upload_date,
1522 'title': video_title,
1523 'stitle': simple_title,
1524 'ext': video_extension.decode('utf-8'),
1525 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1526 'thumbnail': video_thumbnail.decode('utf-8'),
1527 'description': video_description,
1528 'player_url': player_url,
1529 'subtitles': video_subtitles
1531 except UnavailableVideoError, err:
1532 self._downloader.trouble(u'\nERROR: unable to download video')
1535 class MetacafeIE(InfoExtractor):
1536 """Information Extractor for metacafe.com."""
# Group 1 of the match is the video id, group 2 the URL-slug title.
1538 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter endpoints: fetched/POSTed during initialization so that
# filtered content is reachable.
1539 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1540 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1542 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Create the extractor; YouTube-hosted clips get delegated to *youtube_ie*."""
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
def report_disclaimer(self):
    """Announce the retrieval of the family-filter disclaimer page."""
    notice = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(notice)
def report_age_confirmation(self):
    """Announce that the age-confirmation form is being submitted."""
    line = u'[metacafe] Confirming age'
    self._downloader.to_screen(line)
def report_download_webpage(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    msg = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    msg = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1564 def _real_initialize(self):
# Fetches the disclaimer page, then POSTs the family-filter form so that
# filtered videos become visible for the session.
# NOTE(review): elided excerpt -- try:, return and the form-dict opening
# lines are missing from this view.
1565 # Retrieve disclaimer
1566 request = urllib2.Request(self._DISCLAIMER)
1568 self.report_disclaimer()
1569 disclaimer = urllib2.urlopen(request).read()
1570 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1571 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1577 'submit': "Continue - I'm over 18",
1579 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1581 self.report_age_confirmation()
1582 disclaimer = urllib2.urlopen(request).read()
1583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1584 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1587 def _real_extract(self, url):
# Extracts the media URL, title and uploader from a Metacafe watch page.
# NOTE(review): elided excerpt -- error branches (if mobj is None:, try:,
# return) are missing throughout this view.
1588 # Extract id and simplified title from URL
1589 mobj = re.match(self._VALID_URL, url)
1591 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1594 video_id = mobj.group(1)
1596 # Check if video comes from YouTube
1597 mobj2 = re.match(r'^yt-(.*)$', video_id)
1598 if mobj2 is not None:
# 'yt-XXXX' ids are YouTube embeds: delegate to the YouTube extractor.
1599 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1602 # At this point we have a new video
1603 self._downloader.increment_downloads()
1605 simple_title = mobj.group(2).decode('utf-8')
1607 # Retrieve video webpage to extract further information
1608 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1610 self.report_download_webpage(video_id)
1611 webpage = urllib2.urlopen(request).read()
1612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1616 # Extract URL, uploader and title from webpage
1617 self.report_extraction(video_id)
1618 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1619 if mobj is not None:
1620 mediaURL = urllib.unquote(mobj.group(1))
1621 video_extension = mediaURL[-3:]
1623 # Extract gdaKey if available
1624 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1626 video_url = mediaURL
1628 gdaKey = mobj.group(1)
1629 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: the media URL is inside the flashvars query string.
1631 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1633 self._downloader.trouble(u'ERROR: unable to extract media URL')
1635 vardict = parse_qs(mobj.group(1))
1636 if 'mediaData' not in vardict:
1637 self._downloader.trouble(u'ERROR: unable to extract media URL')
1639 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1641 self._downloader.trouble(u'ERROR: unable to extract media URL')
1643 mediaURL = mobj.group(1).replace('\\/', '/')
1644 video_extension = mediaURL[-3:]
1645 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1647 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1649 self._downloader.trouble(u'ERROR: unable to extract title')
1651 video_title = mobj.group(1).decode('utf-8')
1652 video_title = sanitize_title(video_title)
1654 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1656 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1658 video_uploader = mobj.group(1)
1661 # Process video information
1662 self._downloader.process_info({
1663 'id': video_id.decode('utf-8'),
1664 'url': video_url.decode('utf-8'),
1665 'uploader': video_uploader.decode('utf-8'),
1666 'upload_date': u'NA',
1667 'title': video_title,
1668 'stitle': simple_title,
1669 'ext': video_extension.decode('utf-8'),
1673 except UnavailableVideoError:
1674 self._downloader.trouble(u'\nERROR: unable to download video')
1677 class DailymotionIE(InfoExtractor):
1678 """Information Extractor for Dailymotion"""
# Group 1 is the video id (the part before the first underscore in the slug).
1680 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1681 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    msg = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    msg = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1694 def _real_extract(self, url):
# Extracts the SD media URL, title and uploader from a Dailymotion page.
# NOTE(review): elided excerpt -- error branches (if mobj is None:, try:,
# return) are missing throughout this view.
1695 # Extract id and simplified title from URL
1696 mobj = re.match(self._VALID_URL, url)
1698 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1701 # At this point we have a new video
1702 self._downloader.increment_downloads()
1703 video_id = mobj.group(1)
1705 video_extension = 'flv'
1707 # Retrieve video webpage to extract further information
1708 request = urllib2.Request(url)
# Disable the family filter so filtered videos are reachable.
1709 request.add_header('Cookie', 'family_filter=off')
1711 self.report_download_webpage(video_id)
1712 webpage = urllib2.urlopen(request).read()
1713 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1714 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1717 # Extract URL, uploader and title from webpage
1718 self.report_extraction(video_id)
1719 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1721 self._downloader.trouble(u'ERROR: unable to extract media URL')
1723 sequence = urllib.unquote(mobj.group(1))
1724 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1726 self._downloader.trouble(u'ERROR: unable to extract media URL')
1728 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1730 # if needed add http://www.dailymotion.com/ if relative URL
1732 video_url = mediaURL
1734 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1736 self._downloader.trouble(u'ERROR: unable to extract title')
1738 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1739 video_title = sanitize_title(video_title)
1740 simple_title = _simplify_title(video_title)
1742 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1744 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1746 video_uploader = mobj.group(1)
1749 # Process video information
1750 self._downloader.process_info({
1751 'id': video_id.decode('utf-8'),
1752 'url': video_url.decode('utf-8'),
1753 'uploader': video_uploader.decode('utf-8'),
1754 'upload_date': u'NA',
1755 'title': video_title,
1756 'stitle': simple_title,
1757 'ext': video_extension.decode('utf-8'),
1761 except UnavailableVideoError:
1762 self._downloader.trouble(u'\nERROR: unable to download video')
1765 class GoogleIE(InfoExtractor):
1766 """Information extractor for video.google.com."""
# Group 1 is the (possibly negative) docid query parameter.
1768 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1769 IE_NAME = u'video.google'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the video page for *video_id*."""
    msg = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    msg = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1782 def _real_extract(self, url):
# Extracts the media URL, title and description from a Google Video page.
# NOTE(review): elided excerpt -- error branches (if mobj is None:, try:,
# return, else:) are missing throughout this view; the process_info dict
# below also appears to be missing lines (e.g. an 'uploader' entry).
1783 # Extract id from URL
1784 mobj = re.match(self._VALID_URL, url)
1786 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1789 # At this point we have a new video
1790 self._downloader.increment_downloads()
1791 video_id = mobj.group(1)
1793 video_extension = 'mp4'
1795 # Retrieve video webpage to extract further information
1796 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1798 self.report_download_webpage(video_id)
1799 webpage = urllib2.urlopen(request).read()
1800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1804 # Extract URL, uploader, and title from webpage
1805 self.report_extraction(video_id)
# First try the direct mp4 download URL; fall back to the flash URL.
1806 mobj = re.search(r"download_url:'([^']+)'", webpage)
1808 video_extension = 'flv'
1809 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1811 self._downloader.trouble(u'ERROR: unable to extract media URL')
1813 mediaURL = urllib.unquote(mobj.group(1))
# Un-escape the JS hex escapes embedded in the page ('=' and '&').
1814 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1815 mediaURL = mediaURL.replace('\\x26', '\x26')
1817 video_url = mediaURL
1819 mobj = re.search(r'<title>(.*)</title>', webpage)
1821 self._downloader.trouble(u'ERROR: unable to extract title')
1823 video_title = mobj.group(1).decode('utf-8')
1824 video_title = sanitize_title(video_title)
1825 simple_title = _simplify_title(video_title)
1827 # Extract video description
1828 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1830 self._downloader.trouble(u'ERROR: unable to extract video description')
1832 video_description = mobj.group(1).decode('utf-8')
1833 if not video_description:
1834 video_description = 'No description available.'
1836 # Extract video thumbnail
1837 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail only appears on the search results page, so search for
# the video by its (absolute) id.
1838 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1840 webpage = urllib2.urlopen(request).read()
1841 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1844 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1846 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1848 video_thumbnail = mobj.group(1)
1849 else: # we need something to pass to process_info
1850 video_thumbnail = ''
1853 # Process video information
1854 self._downloader.process_info({
1855 'id': video_id.decode('utf-8'),
1856 'url': video_url.decode('utf-8'),
1858 'upload_date': u'NA',
1859 'title': video_title,
1860 'stitle': simple_title,
1861 'ext': video_extension.decode('utf-8'),
1865 except UnavailableVideoError:
1866 self._downloader.trouble(u'\nERROR: unable to download video')
1869 class PhotobucketIE(InfoExtractor):
1870 """Information extractor for photobucket.com."""
# Group 1 is the .flv filename taken from the 'current' query parameter.
1872 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1873 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of the media page for *video_id*."""
    msg = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    msg = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1886 def _real_extract(self, url):
1887 # Extract id from URL
1888 mobj = re.match(self._VALID_URL, url)
1890 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1893 # At this point we have a new video
1894 self._downloader.increment_downloads()
1895 video_id = mobj.group(1)
1897 video_extension = 'flv'
1899 # Retrieve video webpage to extract further information
1900 request = urllib2.Request(url)
1902 self.report_download_webpage(video_id)
1903 webpage = urllib2.urlopen(request).read()
1904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1908 # Extract URL, uploader, and title from webpage
1909 self.report_extraction(video_id)
1910 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1912 self._downloader.trouble(u'ERROR: unable to extract media URL')
1914 mediaURL = urllib.unquote(mobj.group(1))
1916 video_url = mediaURL
1918 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1920 self._downloader.trouble(u'ERROR: unable to extract title')
1922 video_title = mobj.group(1).decode('utf-8')
1923 video_title = sanitize_title(video_title)
1924 simple_title = _simplify_title(vide_title)
1926 video_uploader = mobj.group(2).decode('utf-8')
1929 # Process video information
1930 self._downloader.process_info({
1931 'id': video_id.decode('utf-8'),
1932 'url': video_url.decode('utf-8'),
1933 'uploader': video_uploader,
1934 'upload_date': u'NA',
1935 'title': video_title,
1936 'stitle': simple_title,
1937 'ext': video_extension.decode('utf-8'),
1941 except UnavailableVideoError:
1942 self._downloader.trouble(u'\nERROR: unable to download video')
1945 class YahooIE(InfoExtractor):
1946 """Information extractor for video.yahoo.com."""
1948 # _VALID_URL matches all Yahoo! Video URLs
1949 # _VPAGE_URL matches only the extractable '/watch/' URLs
# Group 2 of a _VALID_URL match is the video id used for reporting.
1950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1952 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce the download of a page for *video_id*."""
    msg = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    msg = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1965 def _real_extract(self, url, new_video=True):
# Rewrites non-/watch/ URLs to the canonical /watch/ form (recursing once
# with new_video=False), then scrapes metadata and queries the playlist
# service for the actual media URL.
# NOTE(review): elided excerpt -- error branches (if mobj is None:, try:,
# return) are missing throughout this view.
1966 # Extract ID from URL
1967 mobj = re.match(self._VALID_URL, url)
1969 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1972 # At this point we have a new video
1973 self._downloader.increment_downloads()
1974 video_id = mobj.group(2)
1975 video_extension = 'flv'
1977 # Rewrite valid but non-extractable URLs as
1978 # extractable English language /watch/ URLs
1979 if re.match(self._VPAGE_URL, url) is None:
1980 request = urllib2.Request(url)
1982 webpage = urllib2.urlopen(request).read()
1983 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1984 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1987 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1989 self._downloader.trouble(u'ERROR: Unable to extract id field')
1991 yahoo_id = mobj.group(1)
1993 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1995 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1997 yahoo_vid = mobj.group(1)
1999 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2000 return self._real_extract(url, new_video=False)
2002 # Retrieve video webpage to extract further information
2003 request = urllib2.Request(url)
2005 self.report_download_webpage(video_id)
2006 webpage = urllib2.urlopen(request).read()
2007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2011 # Extract uploader and title from webpage
2012 self.report_extraction(video_id)
2013 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2015 self._downloader.trouble(u'ERROR: unable to extract video title')
2017 video_title = mobj.group(1).decode('utf-8')
2018 simple_title = _simplify_title(video_title)
2020 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2022 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above captures 'people'/'profile',
# not the uploader name (which is group(2)) -- confirm against full source.
2024 video_uploader = mobj.group(1).decode('utf-8')
2026 # Extract video thumbnail
2027 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2029 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2031 video_thumbnail = mobj.group(1).decode('utf-8')
2033 # Extract video description
2034 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2036 self._downloader.trouble(u'ERROR: unable to extract video description')
2038 video_description = mobj.group(1).decode('utf-8')
2039 if not video_description:
2040 video_description = 'No description available.'
2042 # Extract video height and width
2043 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2045 self._downloader.trouble(u'ERROR: unable to extract video height')
2047 yv_video_height = mobj.group(1)
2049 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2051 self._downloader.trouble(u'ERROR: unable to extract video width')
2053 yv_video_width = mobj.group(1)
2055 # Retrieve video playlist to extract media URL
2056 # I'm not completely sure what all these options are, but we
2057 # seem to need most of them, otherwise the server sends a 401.
2058 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2059 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2060 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2061 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2062 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2064 self.report_download_webpage(video_id)
2065 webpage = urllib2.urlopen(request).read()
2066 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2070 # Extract media URL from playlist XML
2071 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2073 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2075 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2076 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2079 # Process video information
2080 self._downloader.process_info({
2081 'id': video_id.decode('utf-8'),
2083 'uploader': video_uploader,
2084 'upload_date': u'NA',
2085 'title': video_title,
2086 'stitle': simple_title,
2087 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this dict literal -- the later,
# un-decoded value at line 2090 overwrites the decoded one below; one of the
# two entries (and the double .decode, since video_thumbnail was already
# decoded at line 2031) should be removed.
2088 'thumbnail': video_thumbnail.decode('utf-8'),
2089 'description': video_description,
2090 'thumbnail': video_thumbnail,
2093 except UnavailableVideoError:
2094 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for vimeo.com: downloads the video page, pulls the inline JSON
# "config" object out of the HTML, and builds a play_redirect URL from the
# request signature/timestamp plus the best codec found in config.
# NOTE(review): this numbered listing elides some original lines (try:/return
# statements etc.) — visible as gaps in the embedded line numbers.
2097 class VimeoIE(InfoExtractor):
2098 """Information extractor for vimeo.com."""
2100 # _VALID_URL matches Vimeo URLs
2101 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2104 def __init__(self, downloader=None):
2105 InfoExtractor.__init__(self, downloader)
2107 def report_download_webpage(self, video_id):
2108 """Report webpage download."""
2109 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2111 def report_extraction(self, video_id):
2112 """Report information extraction."""
2113 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2115 def _real_extract(self, url, new_video=True):
2116 # Extract ID from URL
2117 mobj = re.match(self._VALID_URL, url)
# Failure branch: report an invalid URL (the guarding `if mobj is None:` line
# is elided from this listing).
2119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2122 # At this point we have a new video
2123 self._downloader.increment_downloads()
2124 video_id = mobj.group(1)
2126 # Retrieve video webpage to extract further information
2127 request = urllib2.Request(url, None, std_headers)
2129 self.report_download_webpage(video_id)
2130 webpage = urllib2.urlopen(request).read()
2131 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2132 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2135 # Now we begin extracting as much information as we can from what we
2136 # retrieved. First we extract the information common to all extractors,
2137 # and latter we extract those that are Vimeo specific.
2138 self.report_extraction(video_id)
2140 # Extract the config JSON
# Brittle: relies on the literal markers ' = {config:' and ',assets:'
# appearing exactly once in the page's inline JavaScript.
2141 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2143 config = json.loads(config)
2145 self._downloader.trouble(u'ERROR: unable to extract info section')
2149 video_title = config["video"]["title"]
2150 simple_title = _simplify_title(video_title)
2153 video_uploader = config["video"]["owner"]["name"]
2155 # Extract video thumbnail
2156 video_thumbnail = config["video"]["thumbnail"]
2158 # Extract video description
# Fallback default first; a regex match on the meta description and then an
# lxml xpath lookup on the #description element can each overwrite it.
2162 video_description = u'No description available.'
2163 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2164 if mobj is not None:
2165 video_description = mobj.group(1)
# NOTE(review): assumes `lxml` is imported at file top — confirm; the author's
# own TODO below acknowledges this parser choice is temporary.
2167 html_parser = lxml.etree.HTMLParser()
2168 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2169 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2170 # TODO use another parser
2172 # Extract upload date
2173 video_upload_date = u'NA'
2174 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2175 if mobj is not None:
2176 video_upload_date = mobj.group(1)
2178 # Vimeo specific: extract request signature and timestamp
2179 sig = config['request']['signature']
2180 timestamp = config['request']['timestamp']
2182 # Vimeo specific: extract video codec and quality information
2183 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 as flv.
2184 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2185 for codec in codecs:
2186 if codec[0] in config["video"]["files"]:
2187 video_codec = codec[0]
2188 video_extension = codec[1]
2189 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2190 else: quality = 'sd'
# Reached only when no codec matched (loop's `break`/`else` lines elided).
2193 self._downloader.trouble(u'ERROR: no known codec found')
2196 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2197 %(video_id, sig, timestamp, quality, video_codec.upper())
2200 # Process video information
2201 self._downloader.process_info({
2204 'uploader': video_uploader,
2205 'upload_date': video_upload_date,
2206 'title': video_title,
2207 'stitle': simple_title,
2208 'ext': video_extension,
2209 'thumbnail': video_thumbnail,
2210 'description': video_description,
2213 except UnavailableVideoError:
2214 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: scrapes any page for a JW-Player-style `file=`/
# `source=` URL and derives id/title/uploader heuristically.
# NOTE(review): this numbered listing elides some original lines (try:/return
# statements etc.) — visible as gaps in the embedded line numbers.
2217 class GenericIE(InfoExtractor):
2218 """Generic last-resort information extractor."""
2221 IE_NAME = u'generic'
2223 def __init__(self, downloader=None):
2224 InfoExtractor.__init__(self, downloader)
2226 def report_download_webpage(self, video_id):
2227 """Report webpage download."""
2228 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2229 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2231 def report_extraction(self, video_id):
2232 """Report information extraction."""
2233 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2235 def _real_extract(self, url):
2236 # At this point we have a new video
2237 self._downloader.increment_downloads()
# Provisional id from the last path segment; replaced below once the real
# media URL is known.
2239 video_id = url.split('/')[-1]
2240 request = urllib2.Request(url)
2242 self.report_download_webpage(video_id)
2243 webpage = urllib2.urlopen(request).read()
2244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2247 except ValueError, err:
2248 # since this is the last-resort InfoExtractor, if
2249 # this error is thrown, it'll be thrown here
2250 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2253 self.report_extraction(video_id)
2254 # Start with something easy: JW Player in SWFObject
2255 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2257 # Broaden the search a little bit
2258 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2260 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2263 # It's possible that one of the regexes
2264 # matched, but returned an empty group:
2265 if mobj.group(1) is None:
2266 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2269 video_url = urllib.unquote(mobj.group(1))
2270 video_id = os.path.basename(video_url)
2272 # here's a fun little line of code for you:
2273 video_extension = os.path.splitext(video_id)[1][1:]
2274 video_id = os.path.splitext(video_id)[0]
2276 # it's tempting to parse this further, but you would
2277 # have to take into account all the variations like
2278 # Video Title - Site Name
2279 # Site Name | Video Title
2280 # Video Title - Tagline | Site Name
2281 # and so on and so forth; it's just not practical
2282 mobj = re.search(r'<title>(.*)</title>', webpage)
2284 self._downloader.trouble(u'ERROR: unable to extract title')
2286 video_title = mobj.group(1).decode('utf-8')
2287 video_title = sanitize_title(video_title)
2288 simple_title = _simplify_title(video_title)
2290 # video uploader is domain name
2291 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure path is about the uploader (domain name), but the
# message below says "unable to extract title" — copy-paste error, fix the
# string in a behavior-changing pass.
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2295 video_uploader = mobj.group(1).decode('utf-8')
2298 # Process video information
2299 self._downloader.process_info({
2300 'id': video_id.decode('utf-8'),
2301 'url': video_url.decode('utf-8'),
2302 'uploader': video_uploader,
2303 'upload_date': u'NA',
2304 'title': video_title,
2305 'stitle': simple_title,
2306 'ext': video_extension.decode('utf-8'),
2310 except UnavailableVideoError, err:
2311 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearch[N|all]:<query>" pseudo-URLs: scrapes the YouTube results
# pages and delegates each found video id to the wrapped YoutubeIE.
# NOTE(review): this numbered listing elides some original lines (try:/return/
# break statements etc.) — visible as gaps in the embedded line numbers.
2314 class YoutubeSearchIE(InfoExtractor):
2315 """Information Extractor for YouTube search queries."""
2316 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2317 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2318 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2319 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2321 _max_youtube_results = 1000
2322 IE_NAME = u'youtube:search'
2324 def __init__(self, youtube_ie, downloader=None):
2325 InfoExtractor.__init__(self, downloader)
# The actual per-video extraction is delegated to this wrapped extractor.
2326 self._youtube_ie = youtube_ie
2328 def report_download_page(self, query, pagenum):
2329 """Report attempt to download playlist page with given number."""
2330 query = query.decode(preferredencoding())
2331 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2333 def _real_initialize(self):
2334 self._youtube_ie.initialize()
2336 def _real_extract(self, query):
2337 mobj = re.match(self._VALID_URL, query)
2339 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') with no maxsplit raises ValueError when the query
# itself contains ':'; whether the elided try block covers this is not
# visible here — confirm against the full file.
2342 prefix, query = query.split(':')
2344 query = query.encode('utf-8')
# Empty prefix ("ytsearch:") means one result; "all" means the maximum.
2346 self._download_n_results(query, 1)
2348 elif prefix == 'all':
2349 self._download_n_results(query, self._max_youtube_results)
# Otherwise the prefix is parsed as an integer n (parse lines elided).
2355 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2357 elif n > self._max_youtube_results:
2358 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2359 n = self._max_youtube_results
2360 self._download_n_results(query, n)
2362 except ValueError: # parsing prefix as integer fails
2363 self._download_n_results(query, 1)
2366 def _download_n_results(self, query, n):
2367 """Downloads a specified number of results for a query"""
2370 already_seen = set()
2374 self.report_download_page(query, pagenum)
2375 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2376 request = urllib2.Request(result_url)
2378 page = urllib2.urlopen(request).read()
2379 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2380 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2383 # Extract video identifiers
2384 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# _VIDEO_INDICATOR has no capture group, so the id is sliced out of the raw
# match text: 'href="/watch?v=ID"' -> split on '=' -> drop trailing quote.
2385 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2386 if video_id not in already_seen:
2387 video_ids.append(video_id)
2388 already_seen.add(video_id)
2389 if len(video_ids) == n:
2390 # Specified n videos reached
2391 for id in video_ids:
2392 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link on the page means this was the last results page.
2395 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2396 for id in video_ids:
2397 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2400 pagenum = pagenum + 1
# Handles "gvsearch[N|all]:<query>" pseudo-URLs for Google Video search.
# Structurally a sibling of YoutubeSearchIE/YahooSearchIE: page loop, id
# collection, then delegation to the wrapped GoogleIE.
# NOTE(review): this numbered listing elides some original lines (try:/return/
# break statements etc.) — visible as gaps in the embedded line numbers.
2403 class GoogleSearchIE(InfoExtractor):
2404 """Information Extractor for Google Video search queries."""
2405 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2406 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2407 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2408 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2410 _max_google_results = 1000
2411 IE_NAME = u'video.google:search'
2413 def __init__(self, google_ie, downloader=None):
2414 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped extractor.
2415 self._google_ie = google_ie
2417 def report_download_page(self, query, pagenum):
2418 """Report attempt to download playlist page with given number."""
2419 query = query.decode(preferredencoding())
2420 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2422 def _real_initialize(self):
2423 self._google_ie.initialize()
2425 def _real_extract(self, query):
2426 mobj = re.match(self._VALID_URL, query)
2428 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): same split(':') colon-in-query caveat as YoutubeSearchIE.
2431 prefix, query = query.split(':')
2433 query = query.encode('utf-8')
2435 self._download_n_results(query, 1)
2437 elif prefix == 'all':
2438 self._download_n_results(query, self._max_google_results)
2444 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2446 elif n > self._max_google_results:
2447 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2448 n = self._max_google_results
2449 self._download_n_results(query, n)
2451 except ValueError: # parsing prefix as integer fails
2452 self._download_n_results(query, 1)
2455 def _download_n_results(self, query, n):
2456 """Downloads a specified number of results for a query"""
2462 self.report_download_page(query, pagenum)
# Google paginates by result offset, hence pagenum*10 (10 results per page).
2463 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2464 request = urllib2.Request(result_url)
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471 # Extract video identifiers
2472 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2473 video_id = mobj.group(1)
# Dedup via linear membership test on the list (no separate seen-set here,
# unlike the YouTube variant).
2474 if video_id not in video_ids:
2475 video_ids.append(video_id)
2476 if len(video_ids) == n:
2477 # Specified n videos reached
2478 for id in video_ids:
2479 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2482 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483 for id in video_ids:
2484 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2487 pagenum = pagenum + 1
# Handles "yvsearch[N|all]:<query>" pseudo-URLs for Yahoo! Video search.
# Same page-loop/delegation pattern as the other *SearchIE classes.
# NOTE(review): this numbered listing elides some original lines (try:/return/
# break statements etc.) — visible as gaps in the embedded line numbers.
2490 class YahooSearchIE(InfoExtractor):
2491 """Information Extractor for Yahoo! Video search queries."""
2492 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2493 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2494 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2495 _MORE_PAGES_INDICATOR = r'\s*Next'
2497 _max_yahoo_results = 1000
2498 IE_NAME = u'video.yahoo:search'
2500 def __init__(self, yahoo_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped extractor.
2502 self._yahoo_ie = yahoo_ie
2504 def report_download_page(self, query, pagenum):
2505 """Report attempt to download playlist page with given number."""
2506 query = query.decode(preferredencoding())
2507 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2509 def _real_initialize(self):
2510 self._yahoo_ie.initialize()
2512 def _real_extract(self, query):
2513 mobj = re.match(self._VALID_URL, query)
2515 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): same split(':') colon-in-query caveat as YoutubeSearchIE.
2518 prefix, query = query.split(':')
2520 query = query.encode('utf-8')
2522 self._download_n_results(query, 1)
2524 elif prefix == 'all':
2525 self._download_n_results(query, self._max_yahoo_results)
2531 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2533 elif n > self._max_yahoo_results:
2534 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2535 n = self._max_yahoo_results
2536 self._download_n_results(query, n)
2538 except ValueError: # parsing prefix as integer fails
2539 self._download_n_results(query, 1)
2542 def _download_n_results(self, query, n):
2543 """Downloads a specified number of results for a query"""
2546 already_seen = set()
2550 self.report_download_page(query, pagenum)
2551 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2552 request = urllib2.Request(result_url)
2554 page = urllib2.urlopen(request).read()
2555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2559 # Extract video identifiers
2560 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Yahoo ids are "num/num" pairs captured by the regex group.
2561 video_id = mobj.group(1)
2562 if video_id not in already_seen:
2563 video_ids.append(video_id)
2564 already_seen.add(video_id)
2565 if len(video_ids) == n:
2566 # Specified n videos reached
2567 for id in video_ids:
2568 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2571 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2572 for id in video_ids:
2573 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2576 pagenum = pagenum + 1
2579 class YoutubePlaylistIE(InfoExtractor):
2580 """Information Extractor for YouTube playlists."""
2582 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2583 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2584 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2585 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2587 IE_NAME = u'youtube:playlist'
2589 def __init__(self, youtube_ie, downloader=None):
2590 InfoExtractor.__init__(self, downloader)
2591 self._youtube_ie = youtube_ie
2593 def report_download_page(self, playlist_id, pagenum):
2594 """Report attempt to download playlist page with given number."""
2595 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2597 def _real_initialize(self):
2598 self._youtube_ie.initialize()
2600 def _real_extract(self, url):
2601 # Extract playlist id
2602 mobj = re.match(self._VALID_URL, url)
2604 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2608 if mobj.group(3) is not None:
2609 self._youtube_ie.extract(mobj.group(3))
2612 # Download playlist pages
2613 # prefix is 'p' as default for playlists but there are other types that need extra care
2614 playlist_prefix = mobj.group(1)
2615 if playlist_prefix == 'a':
2616 playlist_access = 'artist'
2618 playlist_prefix = 'p'
2619 playlist_access = 'view_play_list'
2620 playlist_id = mobj.group(2)
2625 self.report_download_page(playlist_id, pagenum)
2626 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2627 request = urllib2.Request(url)
2629 page = urllib2.urlopen(request).read()
2630 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2631 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2634 # Extract video identifiers
2636 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2637 if mobj.group(1) not in ids_in_page:
2638 ids_in_page.append(mobj.group(1))
2639 video_ids.extend(ids_in_page)
2641 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2643 pagenum = pagenum + 1
2645 playliststart = self._downloader.params.get('playliststart', 1) - 1
2646 playlistend = self._downloader.params.get('playlistend', -1)
2647 video_ids = video_ids[playliststart:playlistend]
2649 for id in video_ids:
2650 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Extractor for a YouTube user's uploads, fetched via the GData API in pages
# of _GDATA_PAGE_SIZE; each collected id is delegated to the wrapped YoutubeIE.
# NOTE(review): this numbered listing elides some original lines (try:/return/
# break statements etc.) — visible as gaps in the embedded line numbers.
2654 class YoutubeUserIE(InfoExtractor):
2655 """Information Extractor for YouTube users."""
2657 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2658 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2659 _GDATA_PAGE_SIZE = 50
2660 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2661 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2663 IE_NAME = u'youtube:user'
2665 def __init__(self, youtube_ie, downloader=None):
2666 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped extractor.
2667 self._youtube_ie = youtube_ie
2669 def report_download_page(self, username, start_index):
2670 """Report attempt to download user page."""
2671 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2672 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2674 def _real_initialize(self):
2675 self._youtube_ie.initialize()
2677 def _real_extract(self, url):
2679 mobj = re.match(self._VALID_URL, url)
2681 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2684 username = mobj.group(1)
2686 # Download video ids using YouTube Data API. Result size per
2687 # query is limited (currently to 50 videos) so we need to query
2688 # page by page until there are no video ids - it means we got
# GData's start-index parameter is 1-based, hence the +1.
2695 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2696 self.report_download_page(username, start_index)
2698 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2701 page = urllib2.urlopen(request).read()
2702 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2703 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2706 # Extract video identifiers
2709 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2710 if mobj.group(1) not in ids_in_page:
2711 ids_in_page.append(mobj.group(1))
2713 video_ids.extend(ids_in_page)
2715 # A little optimization - if current page is not
2716 # "full", ie. does not contain PAGE_SIZE video ids then
2717 # we can assume that this page is the last one - there
2718 # are no more ids on further pages - no need to query
2721 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# (loop break / pagenum increment lines elided from this listing)
2726 all_ids_count = len(video_ids)
2727 playliststart = self._downloader.params.get('playliststart', 1) - 1
2728 playlistend = self._downloader.params.get('playlistend', -1)
# playlistend == -1 means "to the end" — handled explicitly so the final
# video is not sliced off (compare the buggy plain slice in YoutubePlaylistIE).
2730 if playlistend == -1:
2731 video_ids = video_ids[playliststart:]
2733 video_ids = video_ids[playliststart:playlistend]
2735 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2736 (username, all_ids_count, len(video_ids)))
2738 for video_id in video_ids:
2739 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: POSTs the "Free download" form,
# scrapes the real fileshare URL and the file title, then hands the file off
# for download.
# NOTE(review): this numbered listing elides some original lines (try:/return
# statements etc.) — visible as gaps in the embedded line numbers.
2742 class DepositFilesIE(InfoExtractor):
2743 """Information extractor for depositfiles.com"""
2745 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2746 IE_NAME = u'DepositFiles'
2748 def __init__(self, downloader=None):
2749 InfoExtractor.__init__(self, downloader)
2751 def report_download_webpage(self, file_id):
2752 """Report webpage download."""
2753 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2755 def report_extraction(self, file_id):
2756 """Report information extraction."""
2757 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2759 def _real_extract(self, url):
2760 # At this point we have a new file
2761 self._downloader.increment_downloads()
2763 file_id = url.split('/')[-1]
2764 # Rebuild url in english locale
2765 url = 'http://depositfiles.com/en/files/' + file_id
2767 # Retrieve file webpage with 'Free download' button pressed
# Supplying POST data makes urllib2 issue a POST, simulating the button.
2768 free_download_indication = { 'gateway_result' : '1' }
2769 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2771 self.report_download_webpage(file_id)
2772 webpage = urllib2.urlopen(request).read()
2773 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2774 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2777 # Search for the real file URL
2778 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2779 if (mobj is None) or (mobj.group(1) is None):
2780 # Try to figure out reason of the error.
# The site explains failures in an <strong>Attention...</strong> banner;
# collapse its whitespace and surface it to the user.
2781 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2782 if (mobj is not None) and (mobj.group(1) is not None):
2783 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2784 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2786 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2789 file_url = mobj.group(1)
2790 file_extension = os.path.splitext(file_url)[1][1:]
2792 # Search for file title
2793 mobj = re.search(r'<b title="(.*?)">', webpage)
2795 self._downloader.trouble(u'ERROR: unable to extract title')
2797 file_title = mobj.group(1).decode('utf-8')
2800 # Process file information
2801 self._downloader.process_info({
2802 'id': file_id.decode('utf-8'),
2803 'url': file_url.decode('utf-8'),
2805 'upload_date': u'NA',
2806 'title': file_title,
# Files get no simplified title of their own; reuse the title verbatim.
2807 'stitle': file_title,
2808 'ext': file_extension.decode('utf-8'),
2812 except UnavailableVideoError, err:
2813 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Facebook videos. Logs in (credentials from options or .netrc),
# downloads the video page, parses JS-embedded metadata, then picks formats
# the same way the YouTube extractor does.
# NOTE(review): this numbered listing elides some original lines (try:/return
# statements, some initializers) — visible as gaps in the embedded numbers.
2816 class FacebookIE(InfoExtractor):
2817 """Information Extractor for Facebook"""
2819 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2820 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2821 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for format selection below.
2822 _available_formats = ['video', 'highqual', 'lowqual']
2823 _video_extensions = {
2828 IE_NAME = u'facebook'
2830 def __init__(self, downloader=None):
2831 InfoExtractor.__init__(self, downloader)
2833 def _reporter(self, message):
2834 """Add header and report message."""
2835 self._downloader.to_screen(u'[facebook] %s' % message)
2837 def report_login(self):
2838 """Report attempt to log in."""
2839 self._reporter(u'Logging in')
2841 def report_video_webpage_download(self, video_id):
2842 """Report attempt to download video webpage."""
2843 self._reporter(u'%s: Downloading video webpage' % video_id)
2845 def report_information_extraction(self, video_id):
2846 """Report attempt to extract video information."""
2847 self._reporter(u'%s: Extracting video information' % video_id)
2849 def _parse_page(self, video_webpage):
2850 """Extract video information from page"""
# Metadata lives in JS calls like ("video_title", "..."); one regex per field.
2852 data = {'title': r'\("video_title", "(.*?)"\)',
2853 'description': r'<div class="datawrap">(.*?)</div>',
2854 'owner': r'\("video_owner_name", "(.*?)"\)',
2855 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2858 for piece in data.keys():
2859 mobj = re.search(data[piece], video_webpage)
2860 if mobj is not None:
# Values are \uXXXX-escaped inside the JS; decode then URL-unquote.
2861 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one URL per available format name.
2865 for fmt in self._available_formats:
2866 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2867 if mobj is not None:
2868 # URL is in a Javascript segment inside an escaped Unicode format within
2869 # the generally utf-8 page
2870 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2871 video_info['video_urls'] = video_urls
2875 def _real_initialize(self):
# Without a downloader there is nowhere to get credentials from; bail out
# (return line elided).
2876 if self._downloader is None:
2881 downloader_params = self._downloader.params
2883 # Attempt to use provided username and password or .netrc data
2884 if downloader_params.get('username', None) is not None:
2885 useremail = downloader_params['username']
2886 password = downloader_params['password']
2887 elif downloader_params.get('usenetrc', False):
2889 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2890 if info is not None:
2894 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2895 except (IOError, netrc.NetrcParseError), err:
2896 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# NOTE(review): useremail must be initialized to None on an elided line above,
# otherwise this test raises NameError when no credentials are given — confirm
# against the full file.
2899 if useremail is None:
2908 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2911 login_results = urllib2.urlopen(request).read()
# The login form reappearing in the response means authentication failed.
2912 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2913 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2919 def _real_extract(self, url):
2920 mobj = re.match(self._VALID_URL, url)
2922 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2924 video_id = mobj.group('ID')
2927 self.report_video_webpage_download(video_id)
2928 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2930 page = urllib2.urlopen(request)
2931 video_webpage = page.read()
2932 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2933 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2936 # Start extracting information
2937 self.report_information_extraction(video_id)
2939 # Extract information
2940 video_info = self._parse_page(video_webpage)
2943 if 'owner' not in video_info:
2944 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2946 video_uploader = video_info['owner']
2949 if 'title' not in video_info:
2950 self._downloader.trouble(u'ERROR: unable to extract video title')
2952 video_title = video_info['title']
2953 video_title = video_title.decode('utf-8')
2954 video_title = sanitize_title(video_title)
2956 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to the empty string.
2959 if 'thumbnail' not in video_info:
2960 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2961 video_thumbnail = ''
2963 video_thumbnail = video_info['thumbnail']
2967 if 'upload_date' in video_info:
2968 upload_time = video_info['upload_date']
2969 timetuple = email.utils.parsedate_tz(upload_time)
2970 if timetuple is not None:
# parsedate_tz yields a 10-tuple; the first 9 items form a struct_time.
2972 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2977 video_description = video_info.get('description', 'No description available.')
2979 url_map = video_info['video_urls']
2980 if len(url_map.keys()) > 0:
2981 # Decide which formats to download
2982 req_format = self._downloader.params.get('format', None)
2983 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2985 if format_limit is not None and format_limit in self._available_formats:
2986 format_list = self._available_formats[self._available_formats.index(format_limit):]
2988 format_list = self._available_formats
2989 existing_formats = [x for x in format_list if x in url_map]
2990 if len(existing_formats) == 0:
2991 self._downloader.trouble(u'ERROR: no known formats available for video')
2993 if req_format is None:
2994 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2995 elif req_format == 'worst':
2996 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2997 elif req_format == '-1':
2998 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3001 if req_format not in url_map:
3002 self._downloader.trouble(u'ERROR: requested format not available')
3004 video_url_list = [(req_format, url_map[req_format])] # Specific format
3006 for format_param, video_real_url in video_url_list:
3008 # At this point we have a new video
3009 self._downloader.increment_downloads()
3012 video_extension = self._video_extensions.get(format_param, 'mp4')
3015 # Process video information
3016 self._downloader.process_info({
3017 'id': video_id.decode('utf-8'),
3018 'url': video_real_url.decode('utf-8'),
3019 'uploader': video_uploader.decode('utf-8'),
# NOTE(review): upload_date's default initializer is on an elided line;
# presumably u'NA' as in the other extractors — confirm.
3020 'upload_date': upload_date,
3021 'title': video_title,
3022 'stitle': simple_title,
3023 'ext': video_extension.decode('utf-8'),
3024 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3025 'thumbnail': video_thumbnail.decode('utf-8'),
3026 'description': video_description.decode('utf-8'),
3029 except UnavailableVideoError, err:
3030 self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the lowercase file extension off the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		# NOTE(review): this excerpt is missing several original lines
		# (guards/try blocks); commenting only what is visible.
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# Ask blip.tv for the post metadata as JSON (no_wrap=1 avoids a JS wrapper).
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		urlh = urllib2.urlopen(request)
		if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
			basename = url.split('/')[-1]
			title,ext = os.path.splitext(basename)
			title = title.decode('UTF-8')
			ext = ext.replace('.', '')
			self.report_direct_download(title)
			'stitle': _simplify_title(title),
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if info is None: # Regular URL
			json_code = urlh.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
			json_data = json.loads(json_code)
			if 'Post' in json_data:
				data = json_data['Post']
			# blip.tv datestamps look like "10-31-09 1:05PM"; normalise to YYYYMMDD.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)
				'id': data['item_id'],
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': _simplify_title(data['title']),
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
		self._downloader.increment_downloads()
		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3126 class MyVideoIE(InfoExtractor):
3127 """Information Extractor for myvideo.de."""
3129 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3130 IE_NAME = u'myvideo'
3132 def __init__(self, downloader=None):
3133 InfoExtractor.__init__(self, downloader)
3135 def report_download_webpage(self, video_id):
3136 """Report webpage download."""
3137 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3139 def report_extraction(self, video_id):
3140 """Report information extraction."""
3141 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3143 def _real_extract(self,url):
3144 mobj = re.match(self._VALID_URL, url)
3146 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3149 video_id = mobj.group(1)
3152 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3154 self.report_download_webpage(video_id)
3155 webpage = urllib2.urlopen(request).read()
3156 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3157 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3160 self.report_extraction(video_id)
3161 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3164 self._downloader.trouble(u'ERROR: unable to extract media URL')
3166 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3168 mobj = re.search('<title>([^<]+)</title>', webpage)
3170 self._downloader.trouble(u'ERROR: unable to extract title')
3173 video_title = mobj.group(1)
3174 video_title = sanitize_title(video_title)
3176 simple_title = _simplify_title(video_title)
3179 self._downloader.process_info({
3183 'upload_date': u'NA',
3184 'title': video_title,
3185 'stitle': simple_title,
3190 except UnavailableVideoError:
3191 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts shortnames (":tds", ":colbert", ...) or full-episode page URLs.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		# NOTE(review): several original lines (guards, try blocks, else
		# branches) are missing from this excerpt.
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		if mobj.group('shortname'):
			# Shortnames are rewritten to the show's full-episodes index page.
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None
		dlNewest = not mobj.group('episode')
			epTitle = mobj.group('showname')
			epTitle = mobj.group('episode')
		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		htmlHandle = urllib2.urlopen(req)
		html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
		# The index page redirects to the newest episode; re-match the final URL.
		url = htmlHandle.geturl()
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
		if mobj.group('episode') == '':
			self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
		epTitle = mobj.group('episode')
		# Flash player parameters embed the mediaGen URI for the episode.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		urlHandle = urllib2.urlopen(playerUrl_raw)
		playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		# One RSS <item> per video segment of the episode.
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
				urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
			# For now, just pick the highest bitrate
			format,video_url = turls[-1]
			self._downloader.increment_downloads()
			effTitle = showId + u'-' + epTitle
				'upload_date': officialDate,
				'stitle': _simplify_title(effTitle),
				'description': officialTitle,
				'player_url': playerUrl
			self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _real_extract(self, url):
		# Used only to unescape HTML entities in page metadata.
		htmlParser = HTMLParser.HTMLParser()
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		showName = mobj.group('showname')
		videoId = mobj.group('episode')
		self.report_extraction(showName)
		webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
		# Page metadata: description, thumbnail, flash player URL, and the
		# player's config= query parameter (a JS/JSON config file).
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))
		self.report_config_download(showName)
		configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
		# Technically, it's JavaScript, not JSON — single quotes are swapped
		# for double quotes so json.loads can parse it.
		configJSON = configJSON.replace("'", '"')
		config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
		playlist = config['playlist']
		# Playlist entry 1 holds the actual video URL.
		videoUrl = playlist[1]['url']
		self._downloader.increment_downloads()
			'uploader': showName,
			'upload_date': None,
			'stitle': _simplify_title(showName),
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,
		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		htmlParser = HTMLParser.HTMLParser()
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group('videoid')
		self.report_webpage(video_id)
		request = urllib2.Request(url)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
		# The page embeds an internal numeric id used by the moogaloop XML API.
		m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
		internal_video_id = m.group('internalvideoid')
			'internal_id': internal_video_id,
		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		metaXml = urllib2.urlopen(xmlUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
		mdoc = xml.etree.ElementTree.fromstring(metaXml)
		# Fill the info dict from the <video> node of the metadata XML.
		videoNode = mdoc.findall('./video')[0]
		info['description'] = videoNode.findall('./description')[0].text
		info['title'] = videoNode.findall('./caption')[0].text
		info['stitle'] = _simplify_title(info['title'])
		info['url'] = videoNode.findall('./file')[0].text
		info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
		# Extension is taken from the media URL; format mirrors it.
		info['ext'] = info['url'].rpartition('.')[2]
		info['format'] = info['ext']
		self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
		self._downloader.increment_downloads()
		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
	"""Information extractor for xvideos.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
	IE_NAME = u'xvideos'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		htmlParser = HTMLParser.HTMLParser()
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(1).decode('utf-8')
		self.report_webpage(video_id)
		# Canonical page URL is rebuilt from the numeric id.
		request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
		self.report_extraction(video_id)
		# Extract video URL (percent-encoded in the flv_url page parameter).
		mobj = re.search(r'flv_url=(.+?)&', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video url')
		video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
		# Extract title from the <title> tag, dropping the " - XVID..." suffix.
		mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')
		# Extract video thumbnail
		mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')
		self._downloader.increment_downloads()
			'upload_date': None,
			'title': video_title,
			'stitle': _simplify_title(video_title),
			'thumbnail': video_thumbnail,
			'description': None,
		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com
	To access the media, the uid of the song and a stream token
	must be extracted from the page source and the script must make
	a request to media.soundcloud.com/crossdomain.xml. Then
	the media can be grabbed by requesting from an url composed
	of the stream token and uid
	"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		htmlParser = HTMLParser.HTMLParser()
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title = mobj.group(2).decode('utf-8')
		simple_title = uploader + '-' + slug_title
		self.report_webpage('%s/%s' % (uploader, slug_title))
		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
		self.report_extraction('%s/%s' % (uploader, slug_title))
		# extract uid and stream token that soundcloud hands out for access
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
		video_id = mobj.group(1)
		stream_token = mobj.group(2)
		# extract unsimplified title
		mobj = re.search('"title":"(.*?)",', webpage)
		title = mobj.group(1)
		# NOTE(review): `title` extracted above is never used below — the
		# info dict feeds `simple_title` into both 'title' and 'stitle'.
		# Looks like a bug, but surrounding lines are missing; verify upstream.
		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
		description = mobj.group(1)
		# Upload date, e.g. "on November 4, 2011 14:01", normalised to YYYYMMDD.
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
		upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
		except Exception, e:
		# for soundcloud, a request to a cross domain is required for cookies
		request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'uploader': uploader.decode('utf-8'),
			'upload_date': upload_date,
			'title': simple_title.decode('utf-8'),
			'stitle': simple_title.decode('utf-8'),
			'description': description.decode('utf-8')
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
	"""Information extractor for infoq.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		htmlParser = HTMLParser.HTMLParser()
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		self.report_webpage(url)
		request = urllib2.Request(url)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
		self.report_extraction(url)
		# The RTMP path is base64-encoded in the page's jsclassref attribute.
		mobj = re.search(r"jsclassref='([^']*)'", webpage)
		self._downloader.trouble(u'ERROR: unable to extract video url')
		video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
		mobj = re.search(r'contentTitle = "(.*?)";', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')
		# Extract description
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')
		# Video id and extension come from the last path segment of the URL.
		video_filename = video_url.split('/')[-1]
		video_id, extension = video_filename.split('.')
		self._downloader.increment_downloads()
			'upload_date': None,
			'title': video_title,
			'stitle': _simplify_title(video_title),
			'format': extension, # Extension is always(?) mp4, but seems to be flv
			'description': video_description,
		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json"""
		bitrate_list = jsonData[fmt]
		# 'best' (or an unknown bitrate) falls back to the highest available.
		if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
			bitrate = max(bitrate_list) # select highest
		url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]

	def check_urls(self, url_list):
		"""Returns 1st active url from list"""
		for url in url_list:
			urllib2.urlopen(url)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:

	def _print_formats(self, formats):
		# Dump "format<TAB>bitrate<TAB>[ext]" lines for --list-formats.
		print 'Available formats:'
		for fmt in formats.keys():
			for b in formats[fmt]:
				ext = formats[fmt][b][0]
				print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')
		# construct API request
		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = urllib2.Request(file_url)
		self.report_download_json(file_url)
		jsonData = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
		# parse JSON
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])
		req_format = self._downloader.params.get('format', None)
		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
		if req_format is None or req_format == 'best':
			# Probe every format and keep the first one with a live URL.
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				file_url = self.check_urls(url_list)
				if file_url is not None:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format
		self._downloader.increment_downloads()
		# Process file information
		self._downloader.process_info({
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader': uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': json_data['name'],
			'stitle': _simplify_title(json_data['name']),
			'ext': file_url.split('.')[-1].decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a course page, or a specific video page.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
				'id': _simplify_title(course + '_' + video),
			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the videos.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			info['title'] = mdoc.findall('./title')[0].text
			info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape
			course = mobj.group('course')
				'id': _simplify_title(course),
			self.report_download_webpage(info['id'])
			coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			info['title'] = unescapeHTML(m.group(1))
			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])
			m = re.search('<description>([^<]+)</description>', coursepage)
			info['description'] = unescapeHTML(m.group(1))
			# Each video page link becomes a 'reference' entry, extracted below.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
				'type': 'reference',
				'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
			# else: the whole site root — enumerate all course pages.
			unescapeHTML = HTMLParser.HTMLParser().unescape
				'id': 'Stanford OpenClassroom',
			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
				'type': 'reference',
				'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3976 class MTVIE(InfoExtractor):
3977 """Information extractor for MTV.com"""
3979 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3982 def report_webpage(self, video_id):
3983 """Report information extraction."""
3984 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3986 def report_extraction(self, video_id):
3987 """Report information extraction."""
3988 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3990 def _real_extract(self, url):
3991 mobj = re.match(self._VALID_URL, url)
3993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3995 if not mobj.group('proto'):
3996 url = 'http://' + url
3997 video_id = mobj.group('videoid')
3998 self.report_webpage(video_id)
4000 request = urllib2.Request(url)
4002 webpage = urllib2.urlopen(request).read()
4003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4004 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4007 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4009 self._downloader.trouble(u'ERROR: unable to extract song name')
4011 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4012 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4014 self._downloader.trouble(u'ERROR: unable to extract performer')
4016 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017 video_title = performer + ' - ' + song_name
4019 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4021 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4023 mtvn_uri = mobj.group(1)
4025 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4027 self._downloader.trouble(u'ERROR: unable to extract content id')
4029 content_id = mobj.group(1)
4031 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4032 self.report_extraction(video_id)
4033 request = urllib2.Request(videogen_url)
4035 metadataXml = urllib2.urlopen(request).read()
4036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4037 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4040 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4041 renditions = mdoc.findall('.//rendition')
4043 # For now, always pick the highest quality.
4044 rendition = renditions[-1]
4047 _,_,ext = rendition.attrib['type'].partition('/')
4048 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4049 video_url = rendition.find('./src').text
4051 self._downloader.trouble('Invalid rendition field.')
4054 self._downloader.increment_downloads()
4058 'uploader': performer,
4059 'title': video_title,
4060 'stitle': _simplify_title(video_title),
4066 self._downloader.process_info(info)
4067 except UnavailableVideoError, err:
4068 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): this file is a numbered, partially elided listing — each
# line begins with its original source line number, and gaps in that
# numbering (e.g. 4073, 4079-4080) are lines missing from this view.
# Code text is kept byte-identical below; only '#' review comments added.
#
# Base class for post-download processing steps.  A downloader calls
# run() on each registered PostProcessor after a successful download.
4071 class PostProcessor(object):
4072 """Post Processor class.
4074 PostProcessor objects can be added to downloaders with their
4075 add_post_processor() method. When the downloader has finished a
4076 successful download, it will take its internal chain of PostProcessors
4077 and start calling the run() method on each one of them, first with
4078 an initial argument and then with the returned value of the previous
4081 The chain will be stopped if one of them ever returns None or the end
4082 of the chain is reached.
4084 PostProcessor objects follow a "mutual registration" process similar
4085 to InfoExtractor objects.
# Downloader reference; stays None until supplied here or via set_downloader().
4090 def __init__(self, downloader=None):
4091 self._downloader = downloader
4093 def set_downloader(self, downloader):
4094 """Sets the downloader for this PP."""
4095 self._downloader = downloader
4097 def run(self, information):
4098 """Run the PostProcessor.
4100 The "information" argument is a dictionary like the ones
4101 composed by InfoExtractors. The only difference is that this
4102 one has an extra field called "filepath" that points to the
4105 When this method returns None, the postprocessing chain is
4106 stopped. However, this method may return an information
4107 dictionary that will be passed to the next postprocessing
4108 object in the chain. It can be the one it received after
4109 changing some fields.
4111 In addition, this method may raise a PostProcessingError
4112 exception that will be taken into account by the downloader
# Identity transform by default; subclasses override run() with real work.
4115 return information # by default, do nothing
class AudioConversionError(Exception):
    """Raised when ffmpeg fails to convert a downloaded file's audio.

    FFmpegExtractAudioPP.run_ffmpeg() raises this both when the ffmpeg
    binary is missing and when ffmpeg exits non-zero;
    FFmpegExtractAudioPP.run() catches it and reports `message`.

    Fix: the original inherited from BaseException, which lets an
    ordinary application error escape `except Exception` handlers —
    BaseException is reserved for exit-like signals (KeyboardInterrupt,
    SystemExit).  An application-level error belongs under Exception.
    """

    def __init__(self, message):
        # Forward the message so args/str(err) behave like any Exception.
        Exception.__init__(self, message)
        # Kept as an explicit attribute: FFmpegExtractAudioPP.run() reads
        # err.message directly when formatting its error output.
        self.message = message
# Post processor that extracts the audio track of a downloaded video via
# ffmpeg/ffprobe.  (Elided listing: original line numbers are fused into
# each line and gaps in them mark lines missing from this view.)
4121 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (default), 'aac', 'mp3', 'vorbis', 'm4a' or 'wav';
# preferredquality: ffmpeg '-ab' bitrate string; keepvideo: keep the source.
4123 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4124 PostProcessor.__init__(self, downloader)
4125 if preferredcodec is None:
4126 preferredcodec = 'best'
4127 self._preferredcodec = preferredcodec
4128 self._preferredquality = preferredquality
4129 self._keepvideo = keepvideo
# Probe 'path' with ffprobe and return the audio stream's codec_name
# (presumably None on failure — the early-return lines are elided).
# No 'self' parameter: a @staticmethod decorator likely sits on the
# elided line 4131 — TODO confirm.
4132 def get_audio_codec(path):
4134 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
# file() is the Python 2 builtin open; ffprobe's stderr goes to os.devnull.
4135 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4136 output = handle.communicate()[0]
4137 if handle.wait() != 0:
4139 except (IOError, OSError):
# Scan ffprobe's stream dump: remember the last codec_name seen, and
# report it when the matching codec_type=audio line appears.
4142 for line in output.split('\n'):
4143 if line.startswith('codec_name='):
4144 audio_codec = line.split('=')[1].strip()
4145 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcode 'path' to 'out_path', audio-only ('-vn'); raises
# AudioConversionError on failure.  The codec-is-None branch
# (elided lines 4151-4153) presumably sets acodec_opts = [].
4150 def run_ffmpeg(path, out_path, codec, more_opts):
4154 acodec_opts = ['-acodec', codec]
4155 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4157 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4158 stdout,stderr = p.communicate()
4159 except (IOError, OSError):
4160 e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself is missing.
4161 if isinstance(e, OSError) and e.errno == 2:
4162 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4165 if p.returncode != 0:
# Surface only ffmpeg's final stderr line as the error message.
4166 msg = stderr.strip().split('\n')[-1]
4167 raise AudioConversionError(msg)
# Chain entry point: choose target codec/extension/options, convert,
# restore the mtime, optionally delete the source, rewrite 'filepath'.
4169 def run(self, information):
4170 path = information['filepath']
4172 filecodec = self.get_audio_codec(path)
4173 if filecodec is None:
4174 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Source already is (or can be losslessly repackaged to) the wanted codec.
4178 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4179 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4180 # Lossless, but in another container
4182 extension = self._preferredcodec
4183 more_opts = ['-absf', 'aac_adtstoasc']
4184 elif filecodec in ['aac', 'mp3', 'vorbis']:
4185 # Lossless if possible
4187 extension = filecodec
4188 if filecodec == 'aac':
4189 more_opts = ['-f', 'adts']
4190 if filecodec == 'vorbis':
# Fallback (enclosing branch elided): re-encode to MP3.
4194 acodec = 'libmp3lame'
4197 if self._preferredquality is not None:
4198 more_opts += ['-ab', self._preferredquality]
4200 # We convert the audio (lossy)
4201 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4202 extension = self._preferredcodec
4204 if self._preferredquality is not None:
4205 more_opts += ['-ab', self._preferredquality]
4206 if self._preferredcodec == 'aac':
4207 more_opts += ['-f', 'adts']
4208 if self._preferredcodec == 'm4a':
4209 more_opts += ['-absf', 'aac_adtstoasc']
4210 if self._preferredcodec == 'vorbis':
4212 if self._preferredcodec == 'wav':
4214 more_opts += ['-f', 'wav']
4216 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4217 new_path = prefix + sep + extension
4218 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4220 self.run_ffmpeg(path, new_path, acodec, more_opts)
# The except line (4221) is elided; the error is inspected via exc_info.
4222 etype,e,tb = sys.exc_info()
4223 if isinstance(e, AudioConversionError):
4224 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4226 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4229 # Try to update the date time for extracted audio file.
4230 if information.get('filetime') is not None:
4232 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4234 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4236 if not self._keepvideo:
4238 os.remove(_encodeFilename(path))
4239 except (IOError, OSError):
4240 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Tell the rest of the postprocessing chain where the audio file now lives.
4243 information['filepath'] = new_path
# Self-update: overwrite this script file with the latest version fetched
# from UPDATE_URL.  (Elided listing: the try/else scaffolding lines, e.g.
# 4256 and 4271, are missing from this view.)
4247 def updateSelf(downloader, filename):
4248 ''' Update the program file with the latest version from the repository '''
4249 # Note: downloader only used for options
4250 if not os.access(filename, os.W_OK):
4251 sys.exit('ERROR: no write permissions on %s' % filename)
4253 downloader.to_screen(u'Updating to latest version...')
4257 urlh = urllib.urlopen(UPDATE_URL)
4258 newcontent = urlh.read()
# Skip the rewrite when the downloaded copy reports the same __version__.
4260 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4261 if vmatch is not None and vmatch.group(1) == __version__:
4262 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4266 except (IOError, OSError), err:
4267 sys.exit('ERROR: unable to download latest version')
# Binary mode so the downloaded bytes are written verbatim on any platform.
4270 outf = open(filename, 'wb')
4272 outf.write(newcontent)
4275 except (IOError, OSError), err:
4276 sys.exit('ERROR: unable to overwrite current version')
4278 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read extra command-line arguments from a config file; a missing file
# yields [] (the open() sits in an elided try block; the loop over lines
# and the 'res' initialisation are also elided from this listing).
4281 def _readOptions(filename_bytes):
4283 optionf = open(filename_bytes)
4285 return [] # silently skip if file is not present
# Shell-style tokenization per line, honouring '#' comments.
4289 res += shlex.split(l, comments=True)
# Render an optparse Option as "-s, --long METAVAR" for help output.
# NOTE(review): relies on optparse's private _short_opts/_long_opts
# attributes; the 'opts = []' initialisation line is elided here.
4294 def _format_option_string(option):
4295 ''' ('-o', '--option') -> -o, --format METAVAR'''
4299 if option._short_opts: opts.append(option._short_opts[0])
4300 if option._long_opts: opts.append(option._long_opts[0])
# Separator only when both a short and a long form were appended.
4301 if len(opts) > 1: opts.insert(1, ', ')
4303 if option.takes_value(): opts.append(' %s' % option.metavar)
4305 return "".join(opts)
# Best-effort terminal width: the COLUMNS env var first, then `stty size`
# whose output is "rows cols" — hence split()[1].  The COLUMNS-hit return
# and the failure fallback lines are elided from this listing.
4307 def _find_term_columns():
4308 columns = os.environ.get('COLUMNS', None)
4313 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4314 out,err = sp.communicate()
4315 return int(out.split()[1])
# Interior of the option-parsing function (its 'def parseOpts()' line,
# around original line 4320, is elided from this listing).  Builds the
# optparse parser and all option groups, then parses the system config,
# the user config and sys.argv[1:] together.
4321 max_help_position = 80
4323 # No need to wrap help messages if we're on a wide console
4324 columns = _find_term_columns()
4325 if columns: max_width = columns
4327 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Route option rendering through our "-s, --long METAVAR" formatter.
4328 fmt.format_option_strings = _format_option_string
# Keyword dict for OptionParser; its opening line ('kw = {') is elided.
4331 'version' : __version__,
4333 'usage' : '%prog [options] url [url...]',
4334 'conflict_handler' : 'resolve',
4337 parser = optparse.OptionParser(**kw)
4340 general = optparse.OptionGroup(parser, 'General Options')
4341 selection = optparse.OptionGroup(parser, 'Video Selection')
4342 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4343 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4344 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4345 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4346 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4348 general.add_option('-h', '--help',
4349 action='help', help='print this help text and exit')
# NOTE(review): '-v' is declared twice (here as --version, later as
# --verbose); conflict_handler='resolve' means the later --verbose
# definition wins for the short flag.
4350 general.add_option('-v', '--version',
4351 action='version', help='print program version and exit')
4352 general.add_option('-U', '--update',
4353 action='store_true', dest='update_self', help='update this program to latest version')
4354 general.add_option('-i', '--ignore-errors',
4355 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4356 general.add_option('-r', '--rate-limit',
4357 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4358 general.add_option('-R', '--retries',
4359 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4360 general.add_option('--dump-user-agent',
4361 action='store_true', dest='dump_user_agent',
4362 help='display the current browser identification', default=False)
4363 general.add_option('--list-extractors',
4364 action='store_true', dest='list_extractors',
4365 help='List all supported extractors and the URLs they would handle', default=False)
4367 selection.add_option('--playlist-start',
4368 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4369 selection.add_option('--playlist-end',
4370 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4371 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4372 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4373 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4375 authentication.add_option('-u', '--username',
4376 dest='username', metavar='USERNAME', help='account username')
4377 authentication.add_option('-p', '--password',
4378 dest='password', metavar='PASSWORD', help='account password')
4379 authentication.add_option('-n', '--netrc',
4380 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4383 video_format.add_option('-f', '--format',
4384 action='store', dest='format', metavar='FORMAT', help='video format code')
4385 video_format.add_option('--all-formats',
4386 action='store_const', dest='format', help='download all available video formats', const='all')
4387 video_format.add_option('--prefer-free-formats',
4388 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4389 video_format.add_option('--max-quality',
4390 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4391 video_format.add_option('-F', '--list-formats',
4392 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4395 verbosity.add_option('-q', '--quiet',
4396 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4397 verbosity.add_option('-s', '--simulate',
4398 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4399 verbosity.add_option('--skip-download',
4400 action='store_true', dest='skip_download', help='do not download the video', default=False)
4401 verbosity.add_option('-g', '--get-url',
4402 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4403 verbosity.add_option('-e', '--get-title',
4404 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4405 verbosity.add_option('--get-thumbnail',
4406 action='store_true', dest='getthumbnail',
4407 help='simulate, quiet but print thumbnail URL', default=False)
4408 verbosity.add_option('--get-description',
4409 action='store_true', dest='getdescription',
4410 help='simulate, quiet but print video description', default=False)
4411 verbosity.add_option('--get-filename',
4412 action='store_true', dest='getfilename',
4413 help='simulate, quiet but print output filename', default=False)
4414 verbosity.add_option('--get-format',
4415 action='store_true', dest='getformat',
4416 help='simulate, quiet but print output format', default=False)
4417 verbosity.add_option('--no-progress',
4418 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4419 verbosity.add_option('--console-title',
4420 action='store_true', dest='consoletitle',
4421 help='display progress in console titlebar', default=False)
4422 verbosity.add_option('-v', '--verbose',
4423 action='store_true', dest='verbose', help='print various debugging information', default=False)
4426 filesystem.add_option('-t', '--title',
4427 action='store_true', dest='usetitle', help='use title in file name', default=False)
4428 filesystem.add_option('-l', '--literal',
4429 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4430 filesystem.add_option('-A', '--auto-number',
4431 action='store_true', dest='autonumber',
4432 help='number downloaded files starting from 00000', default=False)
4433 filesystem.add_option('-o', '--output',
4434 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4435 filesystem.add_option('-a', '--batch-file',
4436 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4437 filesystem.add_option('-w', '--no-overwrites',
4438 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4439 filesystem.add_option('-c', '--continue',
4440 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4441 filesystem.add_option('--no-continue',
4442 action='store_false', dest='continue_dl',
4443 help='do not resume partially downloaded files (restart from beginning)')
4444 filesystem.add_option('--cookies',
4445 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4446 filesystem.add_option('--no-part',
4447 action='store_true', dest='nopart', help='do not use .part files', default=False)
4448 filesystem.add_option('--no-mtime',
4449 action='store_false', dest='updatetime',
4450 help='do not use the Last-modified header to set the file modification time', default=True)
4451 filesystem.add_option('--write-description',
4452 action='store_true', dest='writedescription',
4453 help='write video description to a .description file', default=False)
4454 filesystem.add_option('--write-info-json',
4455 action='store_true', dest='writeinfojson',
4456 help='write video metadata to a .info.json file', default=False)
4457 filesystem.add_option('--write-srt',
4458 action='store_true', dest='writesubtitles',
4459 help='write video subtitles to a .srt file', default=False)
4462 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4463 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4464 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4465 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4466 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4467 help='ffmpeg audio bitrate specification, 128k by default')
4468 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4469 help='keeps the video file on disk after the post-processing; the video is erased by default')
4472 parser.add_option_group(general)
4473 parser.add_option_group(selection)
4474 parser.add_option_group(filesystem)
4475 parser.add_option_group(verbosity)
4476 parser.add_option_group(video_format)
4477 parser.add_option_group(authentication)
4478 parser.add_option_group(postproc)
# Config resolution: XDG dir if set, else ~/.config; the else branch line
# between these two assignments is elided from this listing.
4480 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4482 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4484 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
# System config first, then user config, then real argv — later wins.
4485 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4486 opts, args = parser.parse_args(argv)
4488 return parser, opts, args
# Instantiate every supported InfoExtractor.  Order matters: the first
# extractor whose suitable() matches a URL handles it.  Most list entries
# (original lines 4503-4520) are elided from this view, as is the
# 'return [' line opening the list.
4490 def gen_extractors():
4491 """ Return a list of an instance of every supported extractor.
4492 The order does matter; the first extractor matched is the one handling the URL.
# Shared instances: the playlist/user/search/Metacafe extractors delegate
# single-video work to the same YoutubeIE (likewise Google/Yahoo search).
4494 youtube_ie = YoutubeIE()
4495 google_ie = GoogleIE()
4496 yahoo_ie = YahooIE()
4498 YoutubePlaylistIE(youtube_ie),
4499 YoutubeUserIE(youtube_ie),
4500 YoutubeSearchIE(youtube_ie),
4502 MetacafeIE(youtube_ie),
4505 GoogleSearchIE(google_ie),
4508 YahooSearchIE(yahoo_ie),
4521 StanfordOpenClassroomIE(),
# Interior of the program's main routine (the enclosing 'def' line,
# before original line 4528, is elided from this listing).  Sequence:
# parse options -> cookies -> batch file -> urllib2 opener -> validate
# options -> build FileDownloader -> register extractors/postprocessors
# -> optional self-update -> download -> save cookie jar.
4528 parser, opts, args = parseOpts()
4530 # Open appropriate CookieJar
4531 if opts.cookiefile is None:
4532 jar = cookielib.CookieJar()
4535 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load when the file already exists and is readable; first runs
# start with an empty jar that gets dumped at exit.
4536 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4538 except (IOError, OSError), err:
4539 sys.exit(u'ERROR: unable to open cookie file')
4542 if opts.dump_user_agent:
4543 print std_headers['User-Agent']
4546 # Batch file verification
4548 if opts.batchfile is not None:
4550 if opts.batchfile == '-':
4553 batchfd = open(opts.batchfile, 'r')
4554 batchurls = batchfd.readlines()
4555 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
4556 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4558 sys.exit(u'ERROR: batch file could not be read')
4559 all_urls = batchurls + args
4561 # General configuration
4562 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4563 proxy_handler = urllib2.ProxyHandler()
4564 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
# Installed globally: all urllib2 calls in the extractors go through it.
4565 urllib2.install_opener(opener)
4566 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4569 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4571 extractors = gen_extractors()
4573 if opts.list_extractors:
4574 for ie in extractors:
4576 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
# Each URL is reported under the first extractor that claims it.
4577 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4578 for mu in matchedUrls:
4582 # Conflicting, missing and erroneous options
4583 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4584 parser.error(u'using .netrc conflicts with giving username/password')
4585 if opts.password is not None and opts.username is None:
4586 parser.error(u'account username missing')
4587 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4588 parser.error(u'using output template conflicts with using title, literal title or auto number')
4589 if opts.usetitle and opts.useliteral:
4590 parser.error(u'using title conflicts with using literal title')
4591 if opts.username is not None and opts.password is None:
4592 opts.password = getpass.getpass(u'Type account password and press return:')
4593 if opts.ratelimit is not None:
4594 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4595 if numeric_limit is None:
4596 parser.error(u'invalid rate limit specified')
4597 opts.ratelimit = numeric_limit
4598 if opts.retries is not None:
4600 opts.retries = long(opts.retries)
4601 except (TypeError, ValueError), err:
4602 parser.error(u'invalid retry count specified')
4604 opts.playliststart = int(opts.playliststart)
4605 if opts.playliststart <= 0:
4606 raise ValueError(u'Playlist start must be positive')
4607 except (TypeError, ValueError), err:
4608 parser.error(u'invalid playlist start number specified')
4610 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel for "until the end of the playlist".
4611 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4612 raise ValueError(u'Playlist end must be greater than playlist start')
4613 except (TypeError, ValueError), err:
4614 parser.error(u'invalid playlist end number specified')
4615 if opts.extractaudio:
4616 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4617 parser.error(u'invalid audio format specified')
4620 fd = FileDownloader({
4621 'usenetrc': opts.usenetrc,
4622 'username': opts.username,
4623 'password': opts.password,
# Any "get and print X" flag implies quiet mode and skipping the download.
4624 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4625 'forceurl': opts.geturl,
4626 'forcetitle': opts.gettitle,
4627 'forcethumbnail': opts.getthumbnail,
4628 'forcedescription': opts.getdescription,
4629 'forcefilename': opts.getfilename,
4630 'forceformat': opts.getformat,
4631 'simulate': opts.simulate,
4632 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4633 'format': opts.format,
4634 'format_limit': opts.format_limit,
4635 'listformats': opts.listformats,
# First matching template wins; bare '%(id)s.%(ext)s' is the fallback.
4636 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4637 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4638 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4639 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4640 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4641 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4642 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4643 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4644 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4645 or u'%(id)s.%(ext)s'),
4646 'ignoreerrors': opts.ignoreerrors,
4647 'ratelimit': opts.ratelimit,
4648 'nooverwrites': opts.nooverwrites,
4649 'retries': opts.retries,
4650 'continuedl': opts.continue_dl,
4651 'noprogress': opts.noprogress,
4652 'playliststart': opts.playliststart,
4653 'playlistend': opts.playlistend,
# Writing media to stdout ('-o -') forces log output onto stderr.
4654 'logtostderr': opts.outtmpl == '-',
4655 'consoletitle': opts.consoletitle,
4656 'nopart': opts.nopart,
4657 'updatetime': opts.updatetime,
4658 'writedescription': opts.writedescription,
4659 'writeinfojson': opts.writeinfojson,
4660 'writesubtitles': opts.writesubtitles,
4661 'matchtitle': opts.matchtitle,
4662 'rejecttitle': opts.rejecttitle,
4663 'max_downloads': opts.max_downloads,
4664 'prefer_free_formats': opts.prefer_free_formats,
4665 'verbose': opts.verbose,
4667 for extractor in extractors:
4668 fd.add_info_extractor(extractor)
4671 if opts.extractaudio:
4672 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4675 if opts.update_self:
4676 updateSelf(fd, sys.argv[0])
4679 if len(all_urls) < 1:
# --update alone is a valid invocation; only then may URLs be absent.
4680 if not opts.update_self:
4681 parser.error(u'you must provide at least one URL')
4686 retcode = fd.download(all_urls)
4687 except MaxDownloadsReached:
4688 fd.to_screen(u'--max-download limit reached, aborting.')
4691 # Dump cookie jar if requested
4692 if opts.cookiefile is not None:
4695 except (IOError, OSError), err:
4696 sys.exit(u'ERROR: unable to save cookie jar')
# Except-clauses of the top-level wrapper around the main routine (the
# enclosing def/try lines, original ~4700-4702, are elided from this
# listing).  Converts known failures into clean exit messages.
4703 except DownloadError:
4705 except SameFileError:
4706 sys.exit(u'ERROR: fixed output name but more than one file to download')
4707 except KeyboardInterrupt:
4708 sys.exit(u'\nERROR: Interrupted by user')
# Script entry point; the guarded call itself (original lines 4711-4712)
# is elided from this listing.
4710 if __name__ == '__main__':
4713 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: