2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
329 def __init__(self, downloaded, expected):
330 self.downloaded = downloaded
331 self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr, encoded in the system's preferred encoding."""
    print >>sys.stderr, message.encode(preferredencoding())
def to_cons_title(self, message):
    """Set console/terminal window title to message."""
    if not self.params.get('consoletitle', False):
        return
    if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
        # c_wchar_p() might not be necessary if `message` is
        # already of type unicode()
        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
    elif 'TERM' in os.environ:
        # xterm-compatible escape sequence for setting the window title.
        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Errors are being ignored: record the failure in the return code.
    self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def temp_name(self, filename):
    """Returns a temporary filename for the given filename.

    Writes go directly to the final name when .part files are disabled,
    when streaming to stdout (u'-'), or when the target exists but is
    not a regular file (e.g. a named pipe).
    """
    if self.params.get('nopart', False) or filename == u'-' or \
            (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename
    return filename + u'.part'
def undo_temp_name(self, filename):
    """Strip the u'.part' suffix from a temporary filename, if present."""
    if filename.endswith(u'.part'):
        return filename[:-len(u'.part')]
    return filename
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """ Report that the description file is being written """
    self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """ Report that the metadata file has been written """
    self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Report destination filename."""
    self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
647 """Report download progress."""
648 if self.params.get('noprogress', False):
650 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
651 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
652 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
653 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663 def report_file_already_downloaded(self, file_name):
664 """Report file has already been fully downloaded."""
666 self.to_screen(u'[download] %s has already been downloaded' % file_name)
667 except (UnicodeEncodeError), err:
668 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    self.to_screen(u'[download] Unable to resume')
674 def report_finish(self):
675 """Report download finished."""
676 if self.params.get('noprogress', False):
677 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    self._num_downloads += 1
685 def prepare_filename(self, info_dict):
686 """Generate the output filename."""
688 template_dict = dict(info_dict)
689 template_dict['epoch'] = unicode(long(time.time()))
690 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691 filename = self.params['outtmpl'] % template_dict
693 except (ValueError, KeyError), err:
694 self.trouble(u'ERROR: invalid system charset or erroneous output template')
697 def process_info(self, info_dict):
698 """Process a single dictionary returned by an InfoExtractor."""
699 filename = self.prepare_filename(info_dict)
700 # Do nothing else if in simulate mode
701 if self.params.get('simulate', False):
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
719 matchtitle=self.params.get('matchtitle',False)
720 rejecttitle=self.params.get('rejecttitle',False)
721 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
722 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
723 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
725 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
726 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
729 if self.params.get('nooverwrites', False) and os.path.exists(filename):
730 self.to_stderr(u'WARNING: file exists and will be skipped')
734 dn = os.path.dirname(filename)
735 if dn != '' and not os.path.exists(dn):
737 except (OSError, IOError), err:
738 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
741 if self.params.get('writedescription', False):
743 descfn = filename + '.description'
744 self.report_writedescription(descfn)
745 descfile = open(descfn, 'wb')
747 descfile.write(info_dict['description'].encode('utf-8'))
750 except (OSError, IOError):
751 self.trouble(u'ERROR: Cannot write description file ' + descfn)
754 if self.params.get('writeinfojson', False):
755 infofn = filename + '.info.json'
756 self.report_writeinfojson(infofn)
759 except (NameError,AttributeError):
760 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
763 infof = open(infofn, 'wb')
765 json.dump(info_dict, infof)
768 except (OSError, IOError):
769 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
773 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
774 except (OSError, IOError), err:
775 raise UnavailableVideoError
776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
777 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
779 except (ContentTooShortError, ), err:
780 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
785 self.post_process(filename, info_dict)
786 except (PostProcessingError), err:
787 self.trouble(u'ERROR: postprocessing: %s' % str(err))
790 def download(self, url_list):
791 """Download a given list of URLs."""
792 if len(url_list) > 1 and self.fixed_template():
793 raise SameFileError(self.params['outtmpl'])
796 suitable_found = False
798 # Go to next InfoExtractor if not suitable
799 if not ie.suitable(url):
802 # Suitable InfoExtractor found
803 suitable_found = True
805 # Extract information from URL and process it
808 # Suitable InfoExtractor had been found; go to next URL
811 if not suitable_found:
812 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
814 return self._download_retcode
816 def post_process(self, filename, ie_info):
817 """Run the postprocessing chain on the given file."""
819 info['filepath'] = filename
825 def _download_with_rtmpdump(self, filename, url, player_url):
826 self.report_destination(filename)
827 tmpfilename = self.temp_name(filename)
829 # Check for rtmpdump first
831 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
832 except (OSError, IOError):
833 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
836 # Download using rtmpdump. rtmpdump returns exit code 2 when
837 # the connection was interrumpted and resuming appears to be
838 # possible. This is part of rtmpdump's normal usage, AFAIK.
839 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
840 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
841 while retval == 2 or retval == 1:
842 prevsize = os.path.getsize(tmpfilename)
843 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
844 time.sleep(5.0) # This seems to be needed
845 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
846 cursize = os.path.getsize(tmpfilename)
847 if prevsize == cursize and retval == 1:
849 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
850 if prevsize == cursize and retval == 2 and cursize > 1024:
851 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
855 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
856 self.try_rename(tmpfilename, filename)
859 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
862 def _do_download(self, filename, url, player_url):
863 # Check file already present
864 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
865 self.report_file_already_downloaded(filename)
868 # Attempt to download using rtmpdump
869 if url.startswith('rtmp'):
870 return self._download_with_rtmpdump(filename, url, player_url)
872 tmpfilename = self.temp_name(filename)
876 # Do not include the Accept-Encoding header
877 headers = {'Youtubedl-no-compression': 'True'}
878 basic_request = urllib2.Request(url, None, headers)
879 request = urllib2.Request(url, None, headers)
881 # Establish possible resume length
882 if os.path.isfile(tmpfilename):
883 resume_len = os.path.getsize(tmpfilename)
887 # Request parameters in case of being able to resume
888 if self.params.get('continuedl', False) and resume_len != 0:
889 self.report_resuming_byte(resume_len)
890 request.add_header('Range', 'bytes=%d-' % resume_len)
894 retries = self.params.get('retries', 0)
895 while count <= retries:
896 # Establish connection
898 data = urllib2.urlopen(request)
900 except (urllib2.HTTPError, ), err:
901 if (err.code < 500 or err.code >= 600) and err.code != 416:
902 # Unexpected HTTP error
904 elif err.code == 416:
905 # Unable to resume (requested range not satisfiable)
907 # Open the connection again without the range header
908 data = urllib2.urlopen(basic_request)
909 content_length = data.info()['Content-Length']
910 except (urllib2.HTTPError, ), err:
911 if err.code < 500 or err.code >= 600:
914 # Examine the reported length
915 if (content_length is not None and
916 (resume_len - 100 < long(content_length) < resume_len + 100)):
917 # The file had already been fully downloaded.
918 # Explanation to the above condition: in issue #175 it was revealed that
919 # YouTube sometimes adds or removes a few bytes from the end of the file,
920 # changing the file size slightly and causing problems for some users. So
921 # I decided to implement a suggested change and consider the file
922 # completely downloaded if the file size differs less than 100 bytes from
923 # the one in the hard drive.
924 self.report_file_already_downloaded(filename)
925 self.try_rename(tmpfilename, filename)
928 # The length does not match, we start the download over
929 self.report_unable_to_resume()
935 self.report_retry(count, retries)
938 self.trouble(u'ERROR: giving up after %s retries' % retries)
941 data_len = data.info().get('Content-length', None)
942 if data_len is not None:
943 data_len = long(data_len) + resume_len
944 data_len_str = self.format_bytes(data_len)
945 byte_counter = 0 + resume_len
951 data_block = data.read(block_size)
953 if len(data_block) == 0:
955 byte_counter += len(data_block)
957 # Open file just in time
960 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
961 assert stream is not None
962 filename = self.undo_temp_name(tmpfilename)
963 self.report_destination(filename)
964 except (OSError, IOError), err:
965 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
968 stream.write(data_block)
969 except (IOError, OSError), err:
970 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
972 block_size = self.best_block_size(after - before, len(data_block))
975 percent_str = self.calc_percent(byte_counter, data_len)
976 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
977 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
978 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
981 self.slow_down(start, byte_counter - resume_len)
984 self.trouble(u'\nERROR: Did not get any data blocks')
988 if data_len is not None and byte_counter != data_len:
989 raise ContentTooShortError(byte_counter, long(data_len))
990 self.try_rename(tmpfilename, filename)
992 # Update file modification time
993 if self.params.get('updatetime', True):
994 self.try_utime(filename, data.info().get('last-modified', None))
999 class InfoExtractor(object):
1000 """Information Extractor class.
1002 Information extractors are the classes that, given a URL, extract
1003 information from the video (or videos) the URL refers to. This
1004 information includes the real video URL, the video title and simplified
1005 title, author and others. The information is stored in a dictionary
1006 which is then passed to the FileDownloader. The FileDownloader
1007 processes this information possibly downloading the video to the file
1008 system, among other possible outcomes. The dictionaries must include
1009 the following fields:
1011 id: Video identifier.
1012 url: Final video URL.
1013 uploader: Nickname of the video uploader.
1014 title: Literal title.
1015 stitle: Simplified title.
1016 ext: Video filename extension.
1017 format: Video format.
1018 player_url: SWF Player URL (may be None).
1020 The following fields are optional. Their primary purpose is to allow
1021 youtube-dl to serve as the backend for a video search function, such
1022 as the one in youtube2mp3. They are only used when their respective
1023 forced printing functions are called:
1025 thumbnail: Full URL to a video thumbnail image.
1026 description: One-line video description.
1028 Subclasses of this one should re-define the _real_initialize() and
1029 _real_extract() methods and define a _VALID_URL regexp.
1030 Probably, they should also be added to the list of extractors.
1036 def __init__(self, downloader=None):
1037 """Constructor. Receives an optional downloader."""
1039 self.set_downloader(downloader)
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE."""
    return re.match(self._VALID_URL, url) is not None
1045 def initialize(self):
1046 """Initializes an instance (authentication, etc)."""
1048 self._real_initialize()
1051 def extract(self, url):
1052 """Extracts URL information and returns it in list of dicts."""
1054 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    self._downloader = downloader
1060 def _real_initialize(self):
1061 """Real initialization process. Redefine in subclasses."""
1064 def _real_extract(self, url):
1065 """Real extraction process. Redefine in subclasses."""
1069 class YoutubeIE(InfoExtractor):
1070 """Information extractor for youtube.com."""
1072 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1073 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1074 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1075 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1076 _NETRC_MACHINE = 'youtube'
1077 # Listed in order of quality
1078 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1079 _video_extensions = {
1085 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1114 def report_unavailable_format(self, video_id, format):
1115 """Report extracted video URL."""
1116 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1118 def report_rtmp_download(self):
1119 """Indicate the download will use the RTMP protocol."""
1120 self._downloader.to_screen(u'[youtube] RTMP download detected')
1122 def _real_initialize(self):
1123 if self._downloader is None:
1128 downloader_params = self._downloader.params
1130 # Attempt to use provided username and password or .netrc data
1131 if downloader_params.get('username', None) is not None:
1132 username = downloader_params['username']
1133 password = downloader_params['password']
1134 elif downloader_params.get('usenetrc', False):
1136 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1137 if info is not None:
1141 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1142 except (IOError, netrc.NetrcParseError), err:
1143 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1147 request = urllib2.Request(self._LANG_URL)
1150 urllib2.urlopen(request).read()
1151 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1152 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1155 # No authentication to be performed
1156 if username is None:
1161 'current_form': 'loginForm',
1163 'action_login': 'Log In',
1164 'username': username,
1165 'password': password,
1167 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1170 login_results = urllib2.urlopen(request).read()
1171 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1172 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1174 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1181 'action_confirm': 'Confirm',
1183 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1185 self.report_age_confirmation()
1186 age_results = urllib2.urlopen(request).read()
1187 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1188 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1191 def _real_extract(self, url):
1192 # Extract video id from URL
1193 mobj = re.match(self._VALID_URL, url)
1195 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1197 video_id = mobj.group(2)
1200 self.report_video_webpage_download(video_id)
1201 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1203 video_webpage = urllib2.urlopen(request).read()
1204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1205 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1208 # Attempt to extract SWF player URL
1209 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1210 if mobj is not None:
1211 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1216 self.report_video_info_webpage_download(video_id)
1217 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1218 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1219 % (video_id, el_type))
1220 request = urllib2.Request(video_info_url)
1222 video_info_webpage = urllib2.urlopen(request).read()
1223 video_info = parse_qs(video_info_webpage)
1224 if 'token' in video_info:
1226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1229 if 'token' not in video_info:
1230 if 'reason' in video_info:
1231 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1233 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1236 # Start extracting information
1237 self.report_information_extraction(video_id)
1240 if 'author' not in video_info:
1241 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1243 video_uploader = urllib.unquote_plus(video_info['author'][0])
1246 if 'title' not in video_info:
1247 self._downloader.trouble(u'ERROR: unable to extract video title')
1249 video_title = urllib.unquote_plus(video_info['title'][0])
1250 video_title = video_title.decode('utf-8')
1251 video_title = sanitize_title(video_title)
1254 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1255 simple_title = simple_title.strip(ur'_')
1258 if 'thumbnail_url' not in video_info:
1259 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1260 video_thumbnail = ''
1261 else: # don't panic if we can't find it
1262 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1266 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1267 if mobj is not None:
1268 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1269 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1270 for expression in format_expressions:
1272 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1280 video_description = u'No description available.'
1281 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1282 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1283 if mobj is not None:
1284 video_description = mobj.group(1).decode('utf-8')
1286 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1287 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1288 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1289 # TODO use another parser
1292 video_token = urllib.unquote_plus(video_info['token'][0])
1294 # Decide which formats to download
1295 req_format = self._downloader.params.get('format', None)
1297 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1298 self.report_rtmp_download()
1299 video_url_list = [(None, video_info['conn'][0])]
1300 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1301 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1302 url_data = [parse_qs(uds) for uds in url_data_strs]
1303 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1304 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1306 format_limit = self._downloader.params.get('format_limit', None)
1307 if format_limit is not None and format_limit in self._available_formats:
1308 format_list = self._available_formats[self._available_formats.index(format_limit):]
1310 format_list = self._available_formats
1311 existing_formats = [x for x in format_list if x in url_map]
1312 if len(existing_formats) == 0:
1313 self._downloader.trouble(u'ERROR: no known formats available for video')
1315 if req_format is None:
1316 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1317 elif req_format == '-1':
1318 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1321 if req_format not in url_map:
1322 self._downloader.trouble(u'ERROR: requested format not available')
1324 video_url_list = [(req_format, url_map[req_format])] # Specific format
1326 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1329 for format_param, video_real_url in video_url_list:
1330 # At this point we have a new video
1331 self._downloader.increment_downloads()
1334 video_extension = self._video_extensions.get(format_param, 'flv')
1337 # Process video information
1338 self._downloader.process_info({
1339 'id': video_id.decode('utf-8'),
1340 'url': video_real_url.decode('utf-8'),
1341 'uploader': video_uploader.decode('utf-8'),
1342 'upload_date': upload_date,
1343 'title': video_title,
1344 'stitle': simple_title,
1345 'ext': video_extension.decode('utf-8'),
1346 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1347 'thumbnail': video_thumbnail.decode('utf-8'),
1348 'description': video_description,
1349 'player_url': player_url,
1351 except UnavailableVideoError, err:
1352 self._downloader.trouble(u'\nERROR: unable to download video')
1355 class MetacafeIE(InfoExtractor):
1356 """Information Extractor for metacafe.com."""
# Groups: (1) video id (may be 'yt-<id>' for YouTube-hosted clips),
# (2) URL-simplified title.
1358 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1359 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1360 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE instance so 'yt-...' ids can be delegated to it.
1363 def __init__(self, youtube_ie, downloader=None):
1364 InfoExtractor.__init__(self, downloader)
1365 self._youtube_ie = youtube_ie
1367 def report_disclaimer(self):
1368 """Report disclaimer retrieval."""
1369 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1371 def report_age_confirmation(self):
1372 """Report attempt to confirm age."""
1373 self._downloader.to_screen(u'[metacafe] Confirming age')
1375 def report_download_webpage(self, video_id):
1376 """Report webpage download."""
1377 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1379 def report_extraction(self, video_id):
1380 """Report information extraction."""
1381 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the disclaimer page, then POST past the family filter.
1383 def _real_initialize(self):
1384 # Retrieve disclaimer
1385 request = urllib2.Request(self._DISCLAIMER)
1387 self.report_disclaimer()
1388 disclaimer = urllib2.urlopen(request).read()
1389 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1390 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1396 'submit': "Continue - I'm over 18",
1398 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1400 self.report_age_confirmation()
1401 disclaimer = urllib2.urlopen(request).read()
1402 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1406 def _real_extract(self, url):
1407 # Extract id and simplified title from URL
1408 mobj = re.match(self._VALID_URL, url)
1410 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1413 video_id = mobj.group(1)
1415 # Check if video comes from YouTube
1416 mobj2 = re.match(r'^yt-(.*)$', video_id)
1417 if mobj2 is not None:
# Delegate YouTube-hosted clips to the YouTube extractor and stop here.
1418 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1421 # At this point we have a new video
1422 self._downloader.increment_downloads()
1424 simple_title = mobj.group(2).decode('utf-8')
1426 # Retrieve video webpage to extract further information
1427 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1429 self.report_download_webpage(video_id)
1430 webpage = urllib2.urlopen(request).read()
1431 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1432 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1435 # Extract URL, uploader and title from webpage
1436 self.report_extraction(video_id)
1437 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1438 if mobj is not None:
1439 mediaURL = urllib.unquote(mobj.group(1))
# Last three characters of the URL are taken as the file extension.
1440 video_extension = mediaURL[-3:]
1442 # Extract gdaKey if available
1443 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1445 video_url = mediaURL
1447 gdaKey = mobj.group(1)
1448 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# NOTE(review): the following appears to be the fallback branch when no
# inline &mediaURL= was found — the flashvars blob is parsed instead.
1450 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1452 self._downloader.trouble(u'ERROR: unable to extract media URL')
1454 vardict = parse_qs(mobj.group(1))
1455 if 'mediaData' not in vardict:
1456 self._downloader.trouble(u'ERROR: unable to extract media URL')
1458 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1460 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JSON-style '\/' and append the access key as __gda__.
1462 mediaURL = mobj.group(1).replace('\\/', '/')
1463 video_extension = mediaURL[-3:]
1464 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1466 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1468 self._downloader.trouble(u'ERROR: unable to extract title')
1470 video_title = mobj.group(1).decode('utf-8')
1471 video_title = sanitize_title(video_title)
1473 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1475 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1477 video_uploader = mobj.group(1)
1480 # Process video information
1481 self._downloader.process_info({
1482 'id': video_id.decode('utf-8'),
1483 'url': video_url.decode('utf-8'),
1484 'uploader': video_uploader.decode('utf-8'),
1485 'upload_date': u'NA',
1486 'title': video_title,
1487 'stitle': simple_title,
1488 'ext': video_extension.decode('utf-8'),
1492 except UnavailableVideoError:
1493 self._downloader.trouble(u'\nERROR: unable to download video')
1496 class DailymotionIE(InfoExtractor):
1497 """Information Extractor for Dailymotion"""
# Groups: (1) video id, (2) URL-simplified title.
1499 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1501 def __init__(self, downloader=None):
1502 InfoExtractor.__init__(self, downloader)
1504 def report_download_webpage(self, video_id):
1505 """Report webpage download."""
1506 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1508 def report_extraction(self, video_id):
1509 """Report information extraction."""
1510 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No setup needed for Dailymotion.
1512 def _real_initialize(self):
1515 def _real_extract(self, url):
1516 # Extract id and simplified title from URL
1517 mobj = re.match(self._VALID_URL, url)
1519 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1522 # At this point we have a new video
1523 self._downloader.increment_downloads()
1524 video_id = mobj.group(1)
1526 simple_title = mobj.group(2).decode('utf-8')
1527 video_extension = 'flv'
1529 # Retrieve video webpage to extract further information
1530 request = urllib2.Request(url)
# Cookie disables the family filter so age-restricted pages are served.
1531 request.add_header('Cookie', 'family_filter=off')
1533 self.report_download_webpage(video_id)
1534 webpage = urllib2.urlopen(request).read()
1535 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1536 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1539 # Extract URL, uploader and title from webpage
1540 self.report_extraction(video_id)
# The player's 'sequence' flashvar holds a URL-encoded JSON-ish blob
# containing the stream URLs.
1541 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1543 self._downloader.trouble(u'ERROR: unable to extract media URL')
1545 sequence = urllib.unquote(mobj.group(1))
1546 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1548 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Strip the blob's backslash escaping from the URL.
1550 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1552 # if needed add http://www.dailymotion.com/ if relative URL
1554 video_url = mediaURL
1556 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1558 self._downloader.trouble(u'ERROR: unable to extract title')
1560 video_title = mobj.group(1).decode('utf-8')
1561 video_title = sanitize_title(video_title)
1563 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1565 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1567 video_uploader = mobj.group(1)
1570 # Process video information
1571 self._downloader.process_info({
1572 'id': video_id.decode('utf-8'),
1573 'url': video_url.decode('utf-8'),
1574 'uploader': video_uploader.decode('utf-8'),
1575 'upload_date': u'NA',
1576 'title': video_title,
1577 'stitle': simple_title,
1578 'ext': video_extension.decode('utf-8'),
1582 except UnavailableVideoError:
1583 self._downloader.trouble(u'\nERROR: unable to download video')
1586 class GoogleIE(InfoExtractor):
1587 """Information extractor for video.google.com."""
# Group 1 captures the docid query parameter (the video id).
1589 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1591 def __init__(self, downloader=None):
1592 InfoExtractor.__init__(self, downloader)
1594 def report_download_webpage(self, video_id):
1595 """Report webpage download."""
1596 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1598 def report_extraction(self, video_id):
1599 """Report information extraction."""
1600 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No setup needed for Google Video.
1602 def _real_initialize(self):
1605 def _real_extract(self, url):
1606 # Extract id from URL
1607 mobj = re.match(self._VALID_URL, url)
1609 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1612 # At this point we have a new video
1613 self._downloader.increment_downloads()
1614 video_id = mobj.group(1)
1616 video_extension = 'mp4'
1618 # Retrieve video webpage to extract further information
1619 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1621 self.report_download_webpage(video_id)
1622 webpage = urllib2.urlopen(request).read()
1623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1627 # Extract URL, uploader, and title from webpage
1628 self.report_extraction(video_id)
# First try the direct mp4 download link embedded in the page JS.
1629 mobj = re.search(r"download_url:'([^']+)'", webpage)
# NOTE(review): this appears to be the fallback branch (no download_url
# found) — the flv stream URL is fished out of hex-escaped JS instead.
1631 video_extension = 'flv'
1632 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1634 self._downloader.trouble(u'ERROR: unable to extract media URL')
1636 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JavaScript hex escaping: \x3d is '=', \x26 is '&'.
1637 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1638 mediaURL = mediaURL.replace('\\x26', '\x26')
1640 video_url = mediaURL
1642 mobj = re.search(r'<title>(.*)</title>', webpage)
1644 self._downloader.trouble(u'ERROR: unable to extract title')
1646 video_title = mobj.group(1).decode('utf-8')
1647 video_title = sanitize_title(video_title)
1648 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1650 # Extract video description
1651 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract video description')
1655 video_description = mobj.group(1).decode('utf-8')
1656 if not video_description:
1657 video_description = 'No description available.'
1659 # Extract video thumbnail
# The thumbnail requires an extra search-page request, so it is only
# fetched when the user explicitly asked for it.
1660 if self._downloader.params.get('forcethumbnail', False):
1661 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1663 webpage = urllib2.urlopen(request).read()
1664 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1665 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1667 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1669 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1671 video_thumbnail = mobj.group(1)
1672 else: # we need something to pass to process_info
1673 video_thumbnail = ''
1676 # Process video information
1677 self._downloader.process_info({
1678 'id': video_id.decode('utf-8'),
1679 'url': video_url.decode('utf-8'),
1681 'upload_date': u'NA',
1682 'title': video_title,
1683 'stitle': simple_title,
1684 'ext': video_extension.decode('utf-8'),
1688 except UnavailableVideoError:
1689 self._downloader.trouble(u'\nERROR: unable to download video')
1692 class PhotobucketIE(InfoExtractor):
1693 """Information extractor for photobucket.com."""
# Group 1 captures the .flv filename from the 'current' query parameter.
1695 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1697 def __init__(self, downloader=None):
1698 InfoExtractor.__init__(self, downloader)
1700 def report_download_webpage(self, video_id):
1701 """Report webpage download."""
1702 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1704 def report_extraction(self, video_id):
1705 """Report information extraction."""
1706 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No setup needed for Photobucket.
1708 def _real_initialize(self):
1711 def _real_extract(self, url):
1712 # Extract id from URL
1713 mobj = re.match(self._VALID_URL, url)
1715 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1718 # At this point we have a new video
1719 self._downloader.increment_downloads()
1720 video_id = mobj.group(1)
1722 video_extension = 'flv'
1724 # Retrieve video webpage to extract further information
1725 request = urllib2.Request(url)
1727 self.report_download_webpage(video_id)
1728 webpage = urllib2.urlopen(request).read()
1729 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1730 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1733 # Extract URL, uploader, and title from webpage
1734 self.report_extraction(video_id)
# The media URL lives in the page's video_src <link> tag.
1735 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1737 self._downloader.trouble(u'ERROR: unable to extract media URL')
1739 mediaURL = urllib.unquote(mobj.group(1))
1741 video_url = mediaURL
# Title tag carries both the title (group 1) and the uploader (group 2).
1743 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1745 self._downloader.trouble(u'ERROR: unable to extract title')
1747 video_title = mobj.group(1).decode('utf-8')
1748 video_title = sanitize_title(video_title)
1749 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1751 video_uploader = mobj.group(2).decode('utf-8')
1754 # Process video information
1755 self._downloader.process_info({
1756 'id': video_id.decode('utf-8'),
1757 'url': video_url.decode('utf-8'),
1758 'uploader': video_uploader,
1759 'upload_date': u'NA',
1760 'title': video_title,
1761 'stitle': simple_title,
1762 'ext': video_extension.decode('utf-8'),
1766 except UnavailableVideoError:
1767 self._downloader.trouble(u'\nERROR: unable to download video')
1770 class YahooIE(InfoExtractor):
1771 """Information extractor for video.yahoo.com."""
1773 # _VALID_URL matches all Yahoo! Video URLs
1774 # _VPAGE_URL matches only the extractable '/watch/' URLs
1775 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1776 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1778 def __init__(self, downloader=None):
1779 InfoExtractor.__init__(self, downloader)
1781 def report_download_webpage(self, video_id):
1782 """Report webpage download."""
1783 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1785 def report_extraction(self, video_id):
1786 """Report information extraction."""
1787 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1789 def _real_initialize(self):
1792 def _real_extract(self, url, new_video=True):
1793 # Extract ID from URL
1794 mobj = re.match(self._VALID_URL, url)
1796 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1799 # At this point we have a new video
1800 self._downloader.increment_downloads()
1801 video_id = mobj.group(2)
1802 video_extension = 'flv'
1804 # Rewrite valid but non-extractable URLs as
1805 # extractable English language /watch/ URLs
1806 if re.match(self._VPAGE_URL, url) is None:
1807 request = urllib2.Request(url)
1809 webpage = urllib2.urlopen(request).read()
1810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1811 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1814 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1816 self._downloader.trouble(u'ERROR: Unable to extract id field')
1818 yahoo_id = mobj.group(1)
1820 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1822 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1824 yahoo_vid = mobj.group(1)
1826 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1827 return self._real_extract(url, new_video=False)
1829 # Retrieve video webpage to extract further information
1830 request = urllib2.Request(url)
1832 self.report_download_webpage(video_id)
1833 webpage = urllib2.urlopen(request).read()
1834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1838 # Extract uploader and title from webpage
1839 self.report_extraction(video_id)
1840 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1842 self._downloader.trouble(u'ERROR: unable to extract video title')
1844 video_title = mobj.group(1).decode('utf-8')
1845 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1847 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1849 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1851 video_uploader = mobj.group(1).decode('utf-8')
1853 # Extract video thumbnail
1854 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1856 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1858 video_thumbnail = mobj.group(1).decode('utf-8')
1860 # Extract video description
1861 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1863 self._downloader.trouble(u'ERROR: unable to extract video description')
1865 video_description = mobj.group(1).decode('utf-8')
1866 if not video_description:
1867 video_description = 'No description available.'
1869 # Extract video height and width
1870 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1872 self._downloader.trouble(u'ERROR: unable to extract video height')
1874 yv_video_height = mobj.group(1)
1876 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1878 self._downloader.trouble(u'ERROR: unable to extract video width')
1880 yv_video_width = mobj.group(1)
1882 # Retrieve video playlist to extract media URL
1883 # I'm not completely sure what all these options are, but we
1884 # seem to need most of them, otherwise the server sends a 401.
1885 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1886 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1887 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1888 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1889 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1891 self.report_download_webpage(video_id)
1892 webpage = urllib2.urlopen(request).read()
1893 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1894 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1897 # Extract media URL from playlist XML
1898 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1900 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1902 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1903 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1906 # Process video information
1907 self._downloader.process_info({
1908 'id': video_id.decode('utf-8'),
1910 'uploader': video_uploader,
1911 'upload_date': u'NA',
1912 'title': video_title,
1913 'stitle': simple_title,
1914 'ext': video_extension.decode('utf-8'),
1915 'thumbnail': video_thumbnail.decode('utf-8'),
1916 'description': video_description,
1917 'thumbnail': video_thumbnail,
1920 except UnavailableVideoError:
1921 self._downloader.trouble(u'\nERROR: unable to download video')
1924 class VimeoIE(InfoExtractor):
1925 """Information extractor for vimeo.com."""
1927 # _VALID_URL matches Vimeo URLs
1928 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1930 def __init__(self, downloader=None):
1931 InfoExtractor.__init__(self, downloader)
1933 def report_download_webpage(self, video_id):
1934 """Report webpage download."""
1935 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1937 def report_extraction(self, video_id):
1938 """Report information extraction."""
1939 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1941 def _real_initialize(self):
1944 def _real_extract(self, url, new_video=True):
1945 # Extract ID from URL
1946 mobj = re.match(self._VALID_URL, url)
1948 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1951 # At this point we have a new video
1952 self._downloader.increment_downloads()
1953 video_id = mobj.group(1)
1955 # Retrieve video webpage to extract further information
1956 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1958 self.report_download_webpage(video_id)
1959 webpage = urllib2.urlopen(request).read()
1960 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1961 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1964 # Now we begin extracting as much information as we can from what we
1965 # retrieved. First we extract the information common to all extractors,
1966 # and latter we extract those that are Vimeo specific.
1967 self.report_extraction(video_id)
1970 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1972 self._downloader.trouble(u'ERROR: unable to extract video title')
1974 video_title = mobj.group(1).decode('utf-8')
1975 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1978 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1980 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1982 video_uploader = mobj.group(1).decode('utf-8')
1984 # Extract video thumbnail
1985 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1987 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1989 video_thumbnail = mobj.group(1).decode('utf-8')
1991 # # Extract video description
1992 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1994 # self._downloader.trouble(u'ERROR: unable to extract video description')
1996 # video_description = mobj.group(1).decode('utf-8')
1997 # if not video_description: video_description = 'No description available.'
1998 video_description = 'Foo.'
2000 # Vimeo specific: extract request signature
2001 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2003 self._downloader.trouble(u'ERROR: unable to extract request signature')
2005 sig = mobj.group(1).decode('utf-8')
2007 # Vimeo specific: Extract request signature expiration
2008 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2010 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2012 sig_exp = mobj.group(1).decode('utf-8')
2014 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2017 # Process video information
2018 self._downloader.process_info({
2019 'id': video_id.decode('utf-8'),
2021 'uploader': video_uploader,
2022 'upload_date': u'NA',
2023 'title': video_title,
2024 'stitle': simple_title,
2026 'thumbnail': video_thumbnail.decode('utf-8'),
2027 'description': video_description,
2028 'thumbnail': video_thumbnail,
2029 'description': video_description,
2032 except UnavailableVideoError:
2033 self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor claims the URL: downloads the
    page and scrapes it for a direct media URL (JW Player / SWFObject
    style embeds).
    """

    _VALID_URL = r'.*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: generic extraction is a best-effort fallback.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No site-specific initialization (e.g. login) is required.
        return

    def _real_extract(self, url):
        """Scrape *url* for a direct media link and hand it to the downloader."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # NOTE(review): message says "title" but this branch fails on the
            # uploader (domain) match -- looks like a copy/paste slip; confirm.
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form "ytsearch[N|all]:terms" by scraping
    the result pages and delegating each found video id to the wrapped
    YoutubeIE instance.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _youtube_ie = None  # delegate extractor that performs the actual download
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the "ytsearch[N|all]:terms" query and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # slice the id out of the matched href="/watch?v=ID" text
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: fewer than n results exist in total.
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form "gvsearch[N|all]:terms"; found video
    ids are delegated to the wrapped GoogleIE instance.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    _google_ie = None  # delegate extractor that performs the actual download
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the "gvsearch[N|all]:terms" query and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: fewer than n results exist in total.
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo-URLs of the form "yvsearch[N|all]:terms"; found video
    ids are delegated to the wrapped YahooIE instance.
    """
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _yahoo_ie = None  # delegate extractor that performs the actual download
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the "yvsearch[N|all]:terms" query and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: fewer than n results exist in total.
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Collects all video ids of a playlist (paging through the HTML) and
    delegates each one to the wrapped YoutubeIE instance.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _youtube_ie = None  # delegate extractor that performs the actual download

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Expand the playlist at *url* and download every contained video."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL pointed at one concrete video inside
        # the playlist, so extract just that one.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honor --playlist-start / --playlist-end (1-based options,
        # converted here to a 0-based slice).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Enumerates a user's uploads through the GData API (50 ids per query)
    and delegates each video id to the wrapped YoutubeIE instance.
    """

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum result size the GData API returns per query
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _youtube_ie = None  # delegate extractor that performs the actual download

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Download all (or the selected slice of) a user's uploaded videos."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based options,
        # converted here to a 0-based slice).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        # No site-specific initialization (e.g. login) is required.
        return

    def _real_extract(self, url):
        """Resolve the free-download URL for the file and queue it."""
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the multi-line restriction notice to one line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        try:
            # Process file information
            self._downloader.process_info({
                'id':       file_id.decode('utf-8'),
                'url':      file_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date':  u'NA',
                'title':    file_title,
                'stitle':   file_title,
                'ext':      file_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials or .netrc), downloads the video page
    and parses the embedded Javascript for metadata and format URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Formats ordered best-first; used both for selection and as url_map keys.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values live in escaped-Unicode Javascript; decode first.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied; best-effort."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            # Anonymous access; public videos still work.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # The login form being echoed back means the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download metadata and queue the selected format(s) of the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): if url_map is empty, video_url_list is never bound and
        # the loop below raises UnboundLocalError -- confirm intended behavior.
        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id':       video_id.decode('utf-8'),
                    'url':      video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date':  upload_date,
                    'title':    video_title,
                    'stitle':   simple_title,
                    'ext':      video_extension.decode('utf-8'),
                    'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail':    video_thumbnail.decode('utf-8'),
                    'description':  video_description.decode('utf-8'),
                    'player_url':   None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Uses blip.tv's JSON API (skin=json) instead of scraping HTML.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'  # filename extension of the media URL

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

    def _simplify_title(self, title):
        """Reduce *title* to filename-safe characters."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        return res

    def _real_extract(self, url):
        """Fetch the JSON description of the post and queue the media file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-API parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        try:
            json_code = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        try:
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            else:
                data = json_data

            # datestamp example: "08-15-11 07:30PM"
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            if umobj is None:
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            self._downloader.increment_downloads()

            info = {
                'id': data['item_id'],
                'url': video_url,
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'stitle': self._simplify_title(data['title']),
                'ext': ext,
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            }
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
            return

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2922 class MyVideoIE(InfoExtractor):
2923 """Information Extractor for myvideo.de."""
2925 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2927 def __init__(self, downloader=None):
2928 InfoExtractor.__init__(self, downloader)
2930 def report_download_webpage(self, video_id):
2931 """Report webpage download."""
2932 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2934 def report_extraction(self, video_id):
2935 """Report information extraction."""
2936 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2938 def _real_initialize(self):
2941 def _real_extract(self,url):
2942 mobj = re.match(self._VALID_URL, url)
2944 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2947 video_id = mobj.group(1)
2948 simple_title = mobj.group(2).decode('utf-8')
2949 # should actually not be necessary
2950 simple_title = sanitize_title(simple_title)
2951 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2954 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2956 self.report_download_webpage(video_id)
2957 webpage = urllib2.urlopen(request).read()
2958 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2959 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2962 self.report_extraction(video_id)
2963 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2966 self._downloader.trouble(u'ERROR: unable to extract media URL')
2968 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2970 mobj = re.search('<title>([^<]+)</title>', webpage)
2972 self._downloader.trouble(u'ERROR: unable to extract title')
2975 video_title = mobj.group(1)
2976 video_title = sanitize_title(video_title)
2980 self._downloader.process_info({
2984 'upload_date': u'NA',
2985 'title': video_title,
2986 'stitle': simple_title,
2991 except UnavailableVideoError:
2992 self._downloader.trouble(u'\nERROR: Unable to download video')
2994 class ComedyCentralIE(InfoExtractor):
2995 """Information extractor for The Daily Show and Colbert Report """
2997 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2999 def report_extraction(self, episode_id):
3000 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3002 def report_config_download(self, episode_id):
3003 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3005 def report_index_download(self, episode_id):
3006 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3008 def report_player_url(self, episode_id):
3009 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3011 def _simplify_title(self, title):
3012 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3013 res = res.strip(ur'_')
# Extract one or more Comedy Central episodes (The Daily Show / Colbert).
# NOTE(review): several original lines (returns after trouble() calls,
# `try:`/`else:` headers, parts of the `info` dict literal) are missing
# from this excerpt; comments describe only what is visible here.
3016 def _real_extract(self, url):
3017 mobj = re.match(self._VALID_URL, url)
# Abort on a URL that does not match the extractor's pattern.
3019 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortcut names map to the show's full-episodes index page, which is
# then re-matched against _VALID_URL.
3022 if mobj.group('shortname'):
3023 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3024 url = 'http://www.thedailyshow.com/full-episodes/'
3026 url = 'http://www.colbertnation.com/full-episodes/'
3027 mobj = re.match(self._VALID_URL, url)
3028 assert mobj is not None
# dlNewest: no explicit episode in the URL, so the site's redirect is
# expected to land on the newest episode.
3030 dlNewest = not mobj.group('episode')
3032 epTitle = mobj.group('showname')
3034 epTitle = mobj.group('episode')
3036 req = urllib2.Request(url)
3037 self.report_extraction(epTitle)
3039 htmlHandle = urllib2.urlopen(req)
3040 html = htmlHandle.read()
3041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3042 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Re-match against the final (possibly redirected) URL so the episode
# group reflects where we actually ended up.
3045 url = htmlHandle.geturl()
3046 mobj = re.match(self._VALID_URL, url)
3048 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3050 if mobj.group('episode') == '':
3051 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3053 epTitle = mobj.group('episode')
# Scrape the embedded Flash <param> to get the player URL and the mrss
# "uri" identifier of the episode.
3055 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3056 if len(mMovieParams) == 0:
3057 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3060 playerUrl_raw = mMovieParams[0][0]
3061 self.report_player_url(epTitle)
# Resolve redirects so we store the player's final URL.
3063 urlHandle = urllib2.urlopen(playerUrl_raw)
3064 playerUrl = urlHandle.geturl()
3065 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3066 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the mrss index feed listing the episode's video items.
3069 uri = mMovieParams[0][1]
3070 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3071 self.report_index_download(epTitle)
3073 indexXml = urllib2.urlopen(indexUrl).read()
3074 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3075 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video part; guid encodes show id and media id as
# colon-separated components.
3078 idoc = xml.etree.ElementTree.fromstring(indexXml)
3079 itemEls = idoc.findall('.//item')
3080 for itemEl in itemEls:
3081 mediaId = itemEl.findall('./guid')[0].text
3082 shortMediaId = mediaId.split(':')[-1]
3083 showId = mediaId.split(':')[-2].replace('.com', '')
3084 officialTitle = itemEl.findall('./title')[0].text
3085 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item mediagen config lists the downloadable renditions.
3087 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3088 urllib.urlencode({'uri': mediaId}))
3089 configReq = urllib2.Request(configUrl)
3090 self.report_config_download(epTitle)
3092 configXml = urllib2.urlopen(configReq).read()
3093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3094 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, url) pairs for each <rendition>.
3097 cdoc = xml.etree.ElementTree.fromstring(configXml)
3099 for rendition in cdoc.findall('.//rendition'):
3100 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3104 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3107 # For now, just pick the highest bitrate
3108 format,video_url = turls[-1]
3110 self._downloader.increment_downloads()
3112 effTitle = showId + '-' + epTitle
# Build the info dict handed to the downloader (partially visible here).
3117 'upload_date': officialDate,
3119 'stitle': self._simplify_title(effTitle),
3123 'description': officialTitle,
3124 'player_url': playerUrl
3128 self._downloader.process_info(info)
3129 except UnavailableVideoError, err:
3130 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): several original lines (`try:` headers, returns after
# trouble() calls, parts of the `info` dict literal) are missing from
# this excerpt; comments describe only what is visible here.
3134 class EscapistIE(InfoExtractor):
3135 """Information extractor for The Escapist """
3137 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
# Progress messages, mirroring the other extractors' reporting style.
3139 def report_extraction(self, showName):
3140 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3142 def report_config_download(self, showName):
3143 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Collapse characters outside the simple-title alphabet into "_".
3145 def _simplify_title(self, title):
3146 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3147 res = res.strip(ur'_')
3150 def _real_extract(self, url):
3151 htmlParser = HTMLParser.HTMLParser()
3153 mobj = re.match(self._VALID_URL, url)
3155 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3157 showName = mobj.group('showname')
3158 videoId = mobj.group('episode')
3160 self.report_extraction(showName)
3162 webPage = urllib2.urlopen(url).read()
3163 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3164 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and player URL from the page's meta
# tags, unescaping HTML entities in each value.
3167 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3168 description = htmlParser.unescape(descMatch.group(1))
3169 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3170 imgUrl = htmlParser.unescape(imgMatch.group(1))
3171 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3172 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries a percent-encoded config URL in its query.
3173 configUrlMatch = re.search('config=(.*)$', playerUrl)
3174 configUrl = urllib2.unquote(configUrlMatch.group(1))
3176 self.report_config_download(showName)
3178 configJSON = urllib2.urlopen(configUrl).read()
3179 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3180 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3183 # Technically, it's JavaScript, not JSON
3184 configJSON = configJSON.replace("'", '"')
3187 config = json.loads(configJSON)
3188 except (ValueError,), err:
3189 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL.
3192 playlist = config['playlist']
3193 videoUrl = playlist[1]['url']
3195 self._downloader.increment_downloads()
# Build the info dict handed to the downloader (partially visible here).
3199 'uploader': showName,
3200 'upload_date': None,
3202 'stitle': self._simplify_title(showName),
3205 'thumbnail': imgUrl,
3206 'description': description,
3207 'player_url': playerUrl,
3211 self._downloader.process_info(info)
3212 except UnavailableVideoError, err:
3213 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor registers with a downloader through the downloader's
    add_post_processor() method.  After each successful download, the
    downloader walks its chain of PostProcessors and calls run() on every
    one, first with an initial argument and then with whatever the
    previous processor returned.  The chain stops as soon as a processor
    returns None or the end of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor belongs to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        ``information`` is a dictionary like the ones composed by
        InfoExtractors, extended with a "filepath" entry naming the
        downloaded file.  Returning None stops the postprocessing chain;
        returning a (possibly modified) dictionary passes it on to the
        next processor.  This method may also raise a
        PostProcessingError, which the downloader takes into account.
        """
        return information  # by default, do nothing
# Post-processor that strips the audio track out of a downloaded video
# using ffmpeg/ffprobe.
# NOTE(review): several original lines (`try:` headers, `return`
# statements, `@staticmethod` decorators and some branches) are missing
# from this excerpt; comments describe only what is visible here.
3264 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (default), 'aac' or 'mp3'.
3266 def __init__(self, downloader=None, preferredcodec=None):
3267 PostProcessor.__init__(self, downloader)
3268 if preferredcodec is None:
3269 preferredcodec = 'best'
3270 self._preferredcodec = preferredcodec
# Probe the file with ffprobe and scan its stream dump for the audio
# stream's codec name; presumably returns None on probe failure.
3273 def get_audio_codec(path):
3275 cmd = ['ffprobe', '-show_streams', '--', path]
3276 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3277 output = handle.communicate()[0]
3278 if handle.wait() != 0:
3280 except (IOError, OSError):
# codec_name= appears before codec_type= in ffprobe's per-stream dump,
# hence the "remember last codec_name, confirm on codec_type=audio" scan.
3283 for line in output.split('\n'):
3284 if line.startswith('codec_name='):
3285 audio_codec = line.split('=')[1].strip()
3286 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode/copy the audio into out_path.
3291 def run_ffmpeg(path, out_path, codec, more_opts):
3293 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3294 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3296 except (IOError, OSError):
# PostProcessor entry point: pick codec/extension/options, transcode,
# then point information['filepath'] at the audio file.
3299 def run(self, information):
3300 path = information['filepath']
3302 filecodec = self.get_audio_codec(path)
3303 if filecodec is None:
3304 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3308 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3309 if filecodec == 'aac' or filecodec == 'mp3':
3310 # Lossless if possible
3312 extension = filecodec
3313 if filecodec == 'aac':
# Raw AAC needs the ADTS container to be playable.
3314 more_opts = ['-f', 'adts']
3317 acodec = 'libmp3lame'
3319 more_opts = ['-ab', '128k']
3321 # We convert the audio (lossy)
3322 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3323 extension = self._preferredcodec
3324 more_opts = ['-ab', '128k']
3325 if self._preferredcodec == 'aac':
3326 more_opts += ['-f', 'adts']
# Output path: same prefix as the video, new audio extension.
3328 (prefix, ext) = os.path.splitext(path)
3329 new_path = prefix + '.' + extension
3330 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3331 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3334 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removal of the original video file happens here; failure to delete is
# only a warning.
3339 except (IOError, OSError):
3340 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3343 information['filepath'] = new_path
# Self-update: overwrite this script's file with the latest version
# fetched from UPDATE_URL.  Exits the process on any failure.
# NOTE(review): the `try:`/`finally:` lines wrapping the download and the
# file write are missing from this excerpt.
3347 def updateSelf(downloader, filename):
3348 ''' Update the program file with the latest version from the repository '''
3349 # Note: downloader only used for options
3350 if not os.access(filename, os.W_OK):
3351 sys.exit('ERROR: no write permissions on %s' % filename)
3353 downloader.to_screen('Updating to latest version...')
# Fetch the replacement script body.
3357 urlh = urllib.urlopen(UPDATE_URL)
3358 newcontent = urlh.read()
3361 except (IOError, OSError), err:
3362 sys.exit('ERROR: unable to download latest version')
# Overwrite our own file in binary mode.
3365 outf = open(filename, 'wb')
3367 outf.write(newcontent)
3370 except (IOError, OSError), err:
3371 sys.exit('ERROR: unable to overwrite current version')
3373 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3380 def _format_option_string(option):
3381 ''' ('-o', '--option') -> -o, --format METAVAR'''
3385 if option._short_opts: opts.append(option._short_opts[0])
3386 if option._long_opts: opts.append(option._long_opts[0])
3387 if len(opts) > 1: opts.insert(1, ', ')
3389 if option.takes_value(): opts.append(' %s' % option.metavar)
3391 return "".join(opts)
3393 def _find_term_columns():
3394 columns = os.environ.get('COLUMNS', None)
3399 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3400 out,err = sp.communicate()
3401 return int(out.split()[1])
# --- interior of parseOpts(): build the command-line option parser -----
# NOTE(review): the enclosing `def parseOpts():` line and several
# connective lines (the `kw = {` opener, blank separators, the closing
# `}` of the kw dict) are missing from this excerpt.
3407 max_help_position = 80
3409 # No need to wrap help messages if we're on a wide console
3410 columns = _find_term_columns()
3411 if columns: max_width = columns
# Custom formatter so option strings render via _format_option_string.
3413 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3414 fmt.format_option_strings = _format_option_string
# Keyword arguments for the OptionParser (dict literal partially visible).
3417 'version' : __version__,
3419 'usage' : '%prog [options] url [url...]',
3420 'conflict_handler' : 'resolve',
3423 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-text section.
3426 general = optparse.OptionGroup(parser, 'General Options')
3427 selection = optparse.OptionGroup(parser, 'Video Selection')
3428 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3429 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3430 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3431 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3432 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options.
3434 general.add_option('-h', '--help',
3435 action='help', help='print this help text and exit')
3436 general.add_option('-v', '--version',
3437 action='version', help='print program version and exit')
3438 general.add_option('-U', '--update',
3439 action='store_true', dest='update_self', help='update this program to latest version')
3440 general.add_option('-i', '--ignore-errors',
3441 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3442 general.add_option('-r', '--rate-limit',
3443 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3444 general.add_option('-R', '--retries',
3445 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3446 general.add_option('--dump-user-agent',
3447 action='store_true', dest='dump_user_agent',
3448 help='display the current browser identification', default=False)
# Video selection (playlist ranges, title matching).
3450 selection.add_option('--playlist-start',
3451 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3452 selection.add_option('--playlist-end',
3453 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3454 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3455 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
# Authentication.
3457 authentication.add_option('-u', '--username',
3458 dest='username', metavar='USERNAME', help='account username')
3459 authentication.add_option('-p', '--password',
3460 dest='password', metavar='PASSWORD', help='account password')
3461 authentication.add_option('-n', '--netrc',
3462 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format selection.
3465 video_format.add_option('-f', '--format',
3466 action='store', dest='format', metavar='FORMAT', help='video format code')
3467 video_format.add_option('--all-formats',
3468 action='store_const', dest='format', help='download all available video formats', const='-1')
3469 video_format.add_option('--max-quality',
3470 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Verbosity / simulation.
3473 verbosity.add_option('-q', '--quiet',
3474 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3475 verbosity.add_option('-s', '--simulate',
3476 action='store_true', dest='simulate', help='do not download video', default=False)
3477 verbosity.add_option('-g', '--get-url',
3478 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3479 verbosity.add_option('-e', '--get-title',
3480 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3481 verbosity.add_option('--get-thumbnail',
3482 action='store_true', dest='getthumbnail',
3483 help='simulate, quiet but print thumbnail URL', default=False)
3484 verbosity.add_option('--get-description',
3485 action='store_true', dest='getdescription',
3486 help='simulate, quiet but print video description', default=False)
3487 verbosity.add_option('--get-filename',
3488 action='store_true', dest='getfilename',
3489 help='simulate, quiet but print output filename', default=False)
3490 verbosity.add_option('--no-progress',
3491 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3492 verbosity.add_option('--console-title',
3493 action='store_true', dest='consoletitle',
3494 help='display progress in console titlebar', default=False)
# Filesystem (naming, batching, resume, cookies, metadata sidecars).
3497 filesystem.add_option('-t', '--title',
3498 action='store_true', dest='usetitle', help='use title in file name', default=False)
3499 filesystem.add_option('-l', '--literal',
3500 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3501 filesystem.add_option('-A', '--auto-number',
3502 action='store_true', dest='autonumber',
3503 help='number downloaded files starting from 00000', default=False)
3504 filesystem.add_option('-o', '--output',
3505 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3506 filesystem.add_option('-a', '--batch-file',
3507 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3508 filesystem.add_option('-w', '--no-overwrites',
3509 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3510 filesystem.add_option('-c', '--continue',
3511 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3512 filesystem.add_option('--cookies',
3513 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3514 filesystem.add_option('--no-part',
3515 action='store_true', dest='nopart', help='do not use .part files', default=False)
3516 filesystem.add_option('--no-mtime',
3517 action='store_false', dest='updatetime',
3518 help='do not use the Last-modified header to set the file modification time', default=True)
3519 filesystem.add_option('--write-description',
3520 action='store_true', dest='writedescription',
3521 help='write video description to a .description file', default=False)
3522 filesystem.add_option('--write-info-json',
3523 action='store_true', dest='writeinfojson',
3524 help='write video metadata to a .info.json file', default=False)
# Post-processing (audio extraction via ffmpeg).
3527 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3528 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3529 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3530 help='"best", "aac" or "mp3"; best by default')
# Group registration order determines help-text order.
3533 parser.add_option_group(general)
3534 parser.add_option_group(selection)
3535 parser.add_option_group(filesystem)
3536 parser.add_option_group(verbosity)
3537 parser.add_option_group(video_format)
3538 parser.add_option_group(authentication)
3539 parser.add_option_group(postproc)
3541 opts, args = parser.parse_args()
3543 return parser, opts, args
# --- interior of the program's main routine ----------------------------
# NOTE(review): the enclosing `def` line and several connective lines
# (`try:` headers, `sys.exit()` calls, blank separators, parts of the
# extractor list) are missing from this excerpt.
3546 parser, opts, args = parseOpts()
3548 # Open appropriate CookieJar
3549 if opts.cookiefile is None:
3550 jar = cookielib.CookieJar()
# With --cookies, use a Mozilla-format jar and preload it if readable.
3553 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3554 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3556 except (IOError, OSError), err:
3557 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string (and presumably exit).
3560 if opts.dump_user_agent:
3561 print std_headers['User-Agent']
3564 # General configuration
3565 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3566 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3567 urllib2.install_opener(opener)
3568 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3570 # Batch file verification
3572 if opts.batchfile is not None:
3574 if opts.batchfile == '-':
3577 batchfd = open(opts.batchfile, 'r')
3578 batchurls = batchfd.readlines()
3579 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with '#', '/' or ';' (comments).
3580 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3582 sys.exit(u'ERROR: batch file could not be read')
3583 all_urls = batchurls + args
3585 # Conflicting, missing and erroneous options
3586 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3587 parser.error(u'using .netrc conflicts with giving username/password')
3588 if opts.password is not None and opts.username is None:
3589 parser.error(u'account username missing')
3590 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3591 parser.error(u'using output template conflicts with using title, literal title or auto number')
3592 if opts.usetitle and opts.useliteral:
3593 parser.error(u'using title conflicts with using literal title')
# Prompt interactively for a password given -u without -p.
3594 if opts.username is not None and opts.password is None:
3595 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize string-valued numeric options.
3596 if opts.ratelimit is not None:
3597 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3598 if numeric_limit is None:
3599 parser.error(u'invalid rate limit specified')
3600 opts.ratelimit = numeric_limit
3601 if opts.retries is not None:
3603 opts.retries = long(opts.retries)
3604 except (TypeError, ValueError), err:
3605 parser.error(u'invalid retry count specified')
3607 opts.playliststart = int(opts.playliststart)
3608 if opts.playliststart <= 0:
3609 raise ValueError(u'Playlist start must be positive')
3610 except (TypeError, ValueError), err:
3611 parser.error(u'invalid playlist start number specified')
3613 opts.playlistend = int(opts.playlistend)
3614 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3615 raise ValueError(u'Playlist end must be greater than playlist start')
3616 except (TypeError, ValueError), err:
3617 parser.error(u'invalid playlist end number specified')
3618 if opts.extractaudio:
3619 if opts.audioformat not in ['best', 'aac', 'mp3']:
3620 parser.error(u'invalid audio format specified')
3622 # Information extractors
# Some IEs are shared so search/playlist IEs can delegate to them.
3623 youtube_ie = YoutubeIE()
3624 google_ie = GoogleIE()
3625 yahoo_ie = YahooIE()
3626 extractors = [ # Order does matter
3628 MetacafeIE(youtube_ie),
3630 YoutubePlaylistIE(youtube_ie),
3631 YoutubeUserIE(youtube_ie),
3632 YoutubeSearchIE(youtube_ie),
3634 GoogleSearchIE(google_ie),
3637 YahooSearchIE(yahoo_ie),
3650 fd = FileDownloader({
3651 'usenetrc': opts.usenetrc,
3652 'username': opts.username,
3653 'password': opts.password,
# Any of the --get-* flags forces quiet + simulate.
3654 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3655 'forceurl': opts.geturl,
3656 'forcetitle': opts.gettitle,
3657 'forcethumbnail': opts.getthumbnail,
3658 'forcedescription': opts.getdescription,
3659 'forcefilename': opts.getfilename,
3660 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3661 'format': opts.format,
3662 'format_limit': opts.format_limit,
# Output template: explicit -o wins, otherwise pick the first template
# matching the combination of --all-formats/-t/-l/-A flags.
3663 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3664 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3665 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3666 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3667 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3668 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3669 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3670 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3671 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3672 or u'%(id)s.%(ext)s'),
3673 'ignoreerrors': opts.ignoreerrors,
3674 'ratelimit': opts.ratelimit,
3675 'nooverwrites': opts.nooverwrites,
3676 'retries': opts.retries,
3677 'continuedl': opts.continue_dl,
3678 'noprogress': opts.noprogress,
3679 'playliststart': opts.playliststart,
3680 'playlistend': opts.playlistend,
# Writing output to stdout ('-o -') forces log messages to stderr.
3681 'logtostderr': opts.outtmpl == '-',
3682 'consoletitle': opts.consoletitle,
3683 'nopart': opts.nopart,
3684 'updatetime': opts.updatetime,
3685 'writedescription': opts.writedescription,
3686 'writeinfojson': opts.writeinfojson,
3687 'matchtitle': opts.matchtitle,
3688 'rejecttitle': opts.rejecttitle,
3690 for extractor in extractors:
3691 fd.add_info_extractor(extractor)
3694 if opts.extractaudio:
3695 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update replaces this script with the latest published version.
3698 if opts.update_self:
3699 updateSelf(fd, sys.argv[0])
3702 if len(all_urls) < 1:
3703 if not opts.update_self:
3704 parser.error(u'you must provide at least one URL')
3707 retcode = fd.download(all_urls)
3709 # Dump cookie jar if requested
3710 if opts.cookiefile is not None:
3713 except (IOError, OSError), err:
3714 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: maps fatal conditions to exit messages.
# NOTE(review): the `try:` header and the call into the main routine are
# missing from this excerpt.
3719 if __name__ == '__main__':
# DownloadError was already reported by the downloader; exit quietly.
3722 except DownloadError:
3724 except SameFileError:
3725 sys.exit(u'ERROR: fixed output name but more than one file to download')
3726 except KeyboardInterrupt:
3727 sys.exit(u'\nERROR: Interrupted by user')
3729 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: