2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.06-phihag'
20 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
48 except ImportError: # Python 2.4
51 import cStringIO as StringIO
55 # parse_qs was moved from the cgi module to the urlparse module recently.
57 from urlparse import parse_qs
59 from cgi import parse_qs
67 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
68 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
69 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
70 'Accept-Encoding': 'gzip, deflate',
71 'Accept-Language': 'en-us,en;q=0.5',
74 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
78 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
84 def raiseError(msg, i):
85 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
86 def skipSpace(i, expectMore=True):
87 while i < len(s) and s[i] in ' \t\r\n':
91 raiseError('Premature end', i)
93 def decodeEscape(match):
109 return unichr(int(esc[1:5], 16))
110 if len(esc) == 5+6 and esc[5:7] == '\\u':
111 hi = int(esc[1:5], 16)
112 low = int(esc[7:11], 16)
113 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
114 raise ValueError('Unknown escape ' + str(esc))
121 while s[e-bslashes-1] == '\\':
123 if bslashes % 2 == 1:
127 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
128 stri = rexp.sub(decodeEscape, s[i:e])
134 if s[i] == '}': # Empty dictionary
138 raiseError('Expected a string object key', i)
139 i,key = parseString(i)
141 if i >= len(s) or s[i] != ':':
142 raiseError('Expected a colon', i)
149 raiseError('Expected comma or closing curly brace', i)
154 if s[i] == ']': # Empty array
159 i = skipSpace(i) # Raise exception if premature end
163 raiseError('Expected a comma or closing bracket', i)
165 def parseDiscrete(i):
166 for k,v in {'true': True, 'false': False, 'null': None}.items():
167 if s.startswith(k, i):
169 raiseError('Not a boolean (or null)', i)
171 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
173 raiseError('Not a number', i)
175 if '.' in nums or 'e' in nums or 'E' in nums:
176 return (i+len(nums), float(nums))
177 return (i+len(nums), int(nums))
178 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
181 i,res = CHARMAP.get(s[i], parseNumber)(i)
182 i = skipSpace(i, False)
186 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
189 def preferredencoding():
190 """Get preferred encoding.
192 Returns the best encoding scheme for the system, based on
193 locale.getpreferredencoding() and some further tweaks.
195 def yield_preferredencoding():
197 pref = locale.getpreferredencoding()
203 return yield_preferredencoding().next()
206 def htmlentity_transform(matchobj):
207 """Transforms an HTML entity to a Unicode character.
209 This function receives a match object and is intended to be used with
210 the re.sub() function.
212 entity = matchobj.group(1)
214 # Known non-numeric HTML entity
215 if entity in htmlentitydefs.name2codepoint:
216 return unichr(htmlentitydefs.name2codepoint[entity])
219 mobj = re.match(ur'(?u)#(x?\d+)', entity)
221 numstr = mobj.group(1)
222 if numstr.startswith(u'x'):
224 numstr = u'0%s' % numstr
227 return unichr(long(numstr, base))
229 # Unknown entity in name, return its literal representation
230 return (u'&%s;' % entity)
233 def sanitize_title(utitle):
234 """Sanitizes a video title so it could be used as part of a filename."""
235 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
236 return utitle.replace(unicode(os.sep), u'%')
239 def sanitize_open(filename, open_mode):
240 """Try to open the given filename, and slightly tweak it if this fails.
242 Attempts to open the given filename. If this fails, it tries to change
243 the filename slightly, step by step, until it's either able to open it
244 or it fails and raises a final exception, like the standard open()
247 It returns the tuple (stream, definitive_file_name).
251 if sys.platform == 'win32':
253 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
254 return (sys.stdout, filename)
255 stream = open(filename, open_mode)
256 return (stream, filename)
257 except (IOError, OSError), err:
258 # In case of error, try to remove win32 forbidden chars
259 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
261 # An exception here should be caught in the caller
262 stream = open(filename, open_mode)
263 return (stream, filename)
266 def timeconvert(timestr):
267 """Convert RFC 2822 defined time string into system timestamp"""
269 timetuple = email.utils.parsedate_tz(timestr)
270 if timetuple is not None:
271 timestamp = email.utils.mktime_tz(timetuple)
275 class DownloadError(Exception):
276 """Download Error exception.
278 This exception may be thrown by FileDownloader objects if they are not
279 configured to continue on errors. They will contain the appropriate
285 class SameFileError(Exception):
286 """Same File exception.
288 This exception will be thrown by FileDownloader objects if they detect
289 multiple files would have to be downloaded to the same file on disk.
294 class PostProcessingError(Exception):
295 """Post Processing exception.
297 This exception may be raised by PostProcessor's .run() method to
298 indicate an error in the postprocessing task.
303 class UnavailableVideoError(Exception):
304 """Unavailable Format exception.
306 This exception will be thrown when a video is requested
307 in a format that is not available for that video.
312 class ContentTooShortError(Exception):
313 """Content Too Short exception.
315 This exception may be raised by FileDownloader objects when a file they
316 download is too small for what the server announced first, indicating
317 the connection was probably interrupted.
323 def __init__(self, downloaded, expected):
324 self.downloaded = downloaded
325 self.expected = expected
328 class YoutubeDLHandler(urllib2.HTTPHandler):
329 """Handler for HTTP requests and responses.
331 This class, when installed with an OpenerDirector, automatically adds
332 the standard headers to every HTTP request and handles gzipped and
333 deflated responses from web servers. If compression is to be avoided in
334 a particular request, the original request in the program code only has
335 to include the HTTP header "Youtubedl-No-Compression", which will be
336 removed before making the real request.
338 Part of this code was copied from:
340 http://techknack.net/python-urllib2-handlers/
342 Andrew Rowls, the author of that code, agreed to release it to the
349 return zlib.decompress(data, -zlib.MAX_WBITS)
351 return zlib.decompress(data)
354 def addinfourl_wrapper(stream, headers, url, code):
355 if hasattr(urllib2.addinfourl, 'getcode'):
356 return urllib2.addinfourl(stream, headers, url, code)
357 ret = urllib2.addinfourl(stream, headers, url)
361 def http_request(self, req):
362 for h in std_headers:
365 req.add_header(h, std_headers[h])
366 if 'Youtubedl-no-compression' in req.headers:
367 if 'Accept-encoding' in req.headers:
368 del req.headers['Accept-encoding']
369 del req.headers['Youtubedl-no-compression']
372 def http_response(self, req, resp):
375 if resp.headers.get('Content-encoding', '') == 'gzip':
376 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
377 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
378 resp.msg = old_resp.msg
380 if resp.headers.get('Content-encoding', '') == 'deflate':
381 gz = StringIO.StringIO(self.deflate(resp.read()))
382 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
383 resp.msg = old_resp.msg
387 class FileDownloader(object):
388 """File Downloader class.
390 File downloader objects are the ones responsible of downloading the
391 actual video file and writing it to disk if the user has requested
392 it, among some other tasks. In most cases there should be one per
393 program. As, given a video URL, the downloader doesn't know how to
394 extract all the needed information, task that InfoExtractors do, it
395 has to pass the URL to one of them.
397 For this, file downloader objects have a method that allows
398 InfoExtractors to be registered in a given order. When it is passed
399 a URL, the file downloader handles it to the first InfoExtractor it
400 finds that reports being able to handle it. The InfoExtractor extracts
401 all the information about the video or videos the URL refers to, and
402 asks the FileDownloader to process the video information, possibly
403 downloading the video.
405 File downloaders accept a lot of parameters. In order not to saturate
406 the object constructor with arguments, it receives a dictionary of
407 options instead. These options are available through the params
408 attribute for the InfoExtractors to use. The FileDownloader also
409 registers itself as the downloader in charge for the InfoExtractors
410 that are added to it, so this is a "mutual registration".
414 username: Username for authentication purposes.
415 password: Password for authentication purposes.
416 usenetrc: Use netrc for authentication instead.
417 quiet: Do not print messages to stdout.
418 forceurl: Force printing final URL.
419 forcetitle: Force printing title.
420 forcethumbnail: Force printing thumbnail URL.
421 forcedescription: Force printing description.
422 forcefilename: Force printing final filename.
423 simulate: Do not download the video files.
424 format: Video format code.
425 format_limit: Highest quality format to try.
426 outtmpl: Template for output names.
427 ignoreerrors: Do not stop on download errors.
428 ratelimit: Download speed limit, in bytes/sec.
429 nooverwrites: Prevent overwriting files.
430 retries: Number of times to retry for HTTP error 5xx
431 continuedl: Try to continue downloads if possible.
432 noprogress: Do not print the progress bar.
433 playliststart: Playlist item to start at.
434 playlistend: Playlist item to end at.
435 logtostderr: Log messages to stderr instead of stdout.
436 consoletitle: Display progress in console window's titlebar.
437 nopart: Do not use temporary .part files.
438 updatetime: Use the Last-modified header to set output file timestamps.
439 writedescription: Write the video description to a .description file
440 writeinfojson: Write the video description to a .info.json file
446 _download_retcode = None
447 _num_downloads = None
450 def __init__(self, params):
451 """Create a FileDownloader object with the given options."""
454 self._download_retcode = 0
455 self._num_downloads = 0
456 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
460 def format_bytes(bytes):
463 if type(bytes) is str:
468 exponent = long(math.log(bytes, 1024.0))
469 suffix = 'bkMGTPEZY'[exponent]
470 converted = float(bytes) / float(1024 ** exponent)
471 return '%.2f%s' % (converted, suffix)
474 def calc_percent(byte_counter, data_len):
477 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
480 def calc_eta(start, now, total, current):
484 if current == 0 or dif < 0.001: # One millisecond
486 rate = float(current) / dif
487 eta = long((float(total) - float(current)) / rate)
488 (eta_mins, eta_secs) = divmod(eta, 60)
491 return '%02d:%02d' % (eta_mins, eta_secs)
494 def calc_speed(start, now, bytes):
496 if bytes == 0 or dif < 0.001: # One millisecond
497 return '%10s' % '---b/s'
498 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
501 def best_block_size(elapsed_time, bytes):
502 new_min = max(bytes / 2.0, 1.0)
503 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
504 if elapsed_time < 0.001:
506 rate = bytes / elapsed_time
514 def parse_bytes(bytestr):
515 """Parse a string indicating a byte quantity into a long integer."""
516 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
519 number = float(matchobj.group(1))
520 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
521 return long(round(number * multiplier))
523 def add_info_extractor(self, ie):
524 """Add an InfoExtractor object to the end of the list."""
526 ie.set_downloader(self)
528 def add_post_processor(self, pp):
529 """Add a PostProcessor object to the end of the chain."""
531 pp.set_downloader(self)
533 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
534 """Print message to stdout if not in quiet mode."""
536 if not self.params.get('quiet', False):
537 terminator = [u'\n', u''][skip_eol]
538 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
539 self._screen_file.flush()
540 except (UnicodeEncodeError), err:
541 if not ignore_encoding_errors:
544 def to_stderr(self, message):
545 """Print message to stderr."""
546 print >>sys.stderr, message.encode(preferredencoding())
548 def to_cons_title(self, message):
549 """Set console/terminal window title to message."""
550 if not self.params.get('consoletitle', False):
552 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
553 # c_wchar_p() might not be necessary if `message` is
554 # already of type unicode()
555 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
556 elif 'TERM' in os.environ:
557 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
559 def fixed_template(self):
560 """Checks if the output template is fixed."""
561 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
563 def trouble(self, message=None):
564 """Determine action to take when a download problem appears.
566 Depending on if the downloader has been configured to ignore
567 download errors or not, this method may throw an exception or
568 not when errors are found, after printing the message.
570 if message is not None:
571 self.to_stderr(message)
572 if not self.params.get('ignoreerrors', False):
573 raise DownloadError(message)
574 self._download_retcode = 1
576 def slow_down(self, start_time, byte_counter):
577 """Sleep if the download speed is over the rate limit."""
578 rate_limit = self.params.get('ratelimit', None)
579 if rate_limit is None or byte_counter == 0:
582 elapsed = now - start_time
585 speed = float(byte_counter) / elapsed
586 if speed > rate_limit:
587 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
589 def temp_name(self, filename):
590 """Returns a temporary filename for the given filename."""
591 if self.params.get('nopart', False) or filename == u'-' or \
592 (os.path.exists(filename) and not os.path.isfile(filename)):
594 return filename + u'.part'
596 def undo_temp_name(self, filename):
597 if filename.endswith(u'.part'):
598 return filename[:-len(u'.part')]
601 def try_rename(self, old_filename, new_filename):
603 if old_filename == new_filename:
605 os.rename(old_filename, new_filename)
606 except (IOError, OSError), err:
607 self.trouble(u'ERROR: unable to rename file')
609 def try_utime(self, filename, last_modified_hdr):
610 """Try to set the last-modified time of the given file."""
611 if last_modified_hdr is None:
613 if not os.path.isfile(filename):
615 timestr = last_modified_hdr
618 filetime = timeconvert(timestr)
622 os.utime(filename, (time.time(), filetime))
626 def report_writedescription(self, descfn):
627 """ Report that the description file is being written """
628 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
630 def report_writeinfojson(self, infofn):
631 """ Report that the metadata file has been written """
632 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
634 def report_destination(self, filename):
635 """Report destination filename."""
636 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
638 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
639 """Report download progress."""
640 if self.params.get('noprogress', False):
642 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
643 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
644 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
645 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
647 def report_resuming_byte(self, resume_len):
648 """Report attempt to resume at given byte."""
649 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
651 def report_retry(self, count, retries):
652 """Report retry in case of HTTP error 5xx"""
653 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
655 def report_file_already_downloaded(self, file_name):
656 """Report file has already been fully downloaded."""
658 self.to_screen(u'[download] %s has already been downloaded' % file_name)
659 except (UnicodeEncodeError), err:
660 self.to_screen(u'[download] The file has already been downloaded')
662 def report_unable_to_resume(self):
663 """Report it was impossible to resume download."""
664 self.to_screen(u'[download] Unable to resume')
666 def report_finish(self):
667 """Report download finished."""
668 if self.params.get('noprogress', False):
669 self.to_screen(u'[download] Download completed')
673 def increment_downloads(self):
674 """Increment the ordinal that assigns a number to each file."""
675 self._num_downloads += 1
677 def prepare_filename(self, info_dict):
678 """Generate the output filename."""
680 template_dict = dict(info_dict)
681 template_dict['epoch'] = unicode(long(time.time()))
682 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
683 filename = self.params['outtmpl'] % template_dict
685 except (ValueError, KeyError), err:
686 self.trouble(u'ERROR: invalid system charset or erroneous output template')
689 def process_info(self, info_dict):
690 """Process a single dictionary returned by an InfoExtractor."""
691 filename = self.prepare_filename(info_dict)
692 # Do nothing else if in simulate mode
693 if self.params.get('simulate', False):
695 if self.params.get('forcetitle', False):
696 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
697 if self.params.get('forceurl', False):
698 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
699 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
700 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
701 if self.params.get('forcedescription', False) and 'description' in info_dict:
702 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
703 if self.params.get('forcefilename', False) and filename is not None:
704 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
710 if self.params.get('nooverwrites', False) and os.path.exists(filename):
711 self.to_stderr(u'WARNING: file exists and will be skipped')
715 dn = os.path.dirname(filename)
716 if dn != '' and not os.path.exists(dn):
718 except (OSError, IOError), err:
719 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
722 if self.params.get('writedescription', False):
724 descfn = filename + '.description'
725 self.report_writedescription(descfn)
726 descfile = open(descfn, 'wb')
728 descfile.write(info_dict['description'].encode('utf-8'))
731 except (OSError, IOError):
732 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
735 if self.params.get('writeinfojson', False):
736 infofn = filename + '.info.json'
737 self.report_writeinfojson(infofn)
740 except (NameError,AttributeError):
741 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
744 infof = open(infofn, 'wb')
746 json.dump(info_dict, infof)
749 except (OSError, IOError):
750 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
754 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
755 except (OSError, IOError), err:
756 raise UnavailableVideoError
757 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
758 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
760 except (ContentTooShortError, ), err:
761 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
766 self.post_process(filename, info_dict)
767 except (PostProcessingError), err:
768 self.trouble(u'ERROR: postprocessing: %s' % str(err))
771 def download(self, url_list):
772 """Download a given list of URLs."""
773 if len(url_list) > 1 and self.fixed_template():
774 raise SameFileError(self.params['outtmpl'])
777 suitable_found = False
779 # Go to next InfoExtractor if not suitable
780 if not ie.suitable(url):
783 # Suitable InfoExtractor found
784 suitable_found = True
786 # Extract information from URL and process it
789 # Suitable InfoExtractor had been found; go to next URL
792 if not suitable_found:
793 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
795 return self._download_retcode
797 def post_process(self, filename, ie_info):
798 """Run the postprocessing chain on the given file."""
800 info['filepath'] = filename
806 def _download_with_rtmpdump(self, filename, url, player_url):
807 self.report_destination(filename)
808 tmpfilename = self.temp_name(filename)
810 # Check for rtmpdump first
812 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
813 except (OSError, IOError):
814 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
817 # Download using rtmpdump. rtmpdump returns exit code 2 when
818 # the connection was interrumpted and resuming appears to be
819 # possible. This is part of rtmpdump's normal usage, AFAIK.
820 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
821 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
822 while retval == 2 or retval == 1:
823 prevsize = os.path.getsize(tmpfilename)
824 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
825 time.sleep(5.0) # This seems to be needed
826 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
827 cursize = os.path.getsize(tmpfilename)
828 if prevsize == cursize and retval == 1:
831 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
832 self.try_rename(tmpfilename, filename)
835 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
838 def _do_download(self, filename, url, player_url):
839 # Check file already present
840 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
841 self.report_file_already_downloaded(filename)
844 # Attempt to download using rtmpdump
845 if url.startswith('rtmp'):
846 return self._download_with_rtmpdump(filename, url, player_url)
848 tmpfilename = self.temp_name(filename)
852 # Do not include the Accept-Encoding header
853 headers = {'Youtubedl-no-compression': 'True'}
854 basic_request = urllib2.Request(url, None, headers)
855 request = urllib2.Request(url, None, headers)
857 # Establish possible resume length
858 if os.path.isfile(tmpfilename):
859 resume_len = os.path.getsize(tmpfilename)
863 # Request parameters in case of being able to resume
864 if self.params.get('continuedl', False) and resume_len != 0:
865 self.report_resuming_byte(resume_len)
866 request.add_header('Range', 'bytes=%d-' % resume_len)
870 retries = self.params.get('retries', 0)
871 while count <= retries:
872 # Establish connection
874 data = urllib2.urlopen(request)
876 except (urllib2.HTTPError, ), err:
877 if (err.code < 500 or err.code >= 600) and err.code != 416:
878 # Unexpected HTTP error
880 elif err.code == 416:
881 # Unable to resume (requested range not satisfiable)
883 # Open the connection again without the range header
884 data = urllib2.urlopen(basic_request)
885 content_length = data.info()['Content-Length']
886 except (urllib2.HTTPError, ), err:
887 if err.code < 500 or err.code >= 600:
890 # Examine the reported length
891 if (content_length is not None and
892 (resume_len - 100 < long(content_length) < resume_len + 100)):
893 # The file had already been fully downloaded.
894 # Explanation to the above condition: in issue #175 it was revealed that
895 # YouTube sometimes adds or removes a few bytes from the end of the file,
896 # changing the file size slightly and causing problems for some users. So
897 # I decided to implement a suggested change and consider the file
898 # completely downloaded if the file size differs less than 100 bytes from
899 # the one in the hard drive.
900 self.report_file_already_downloaded(filename)
901 self.try_rename(tmpfilename, filename)
904 # The length does not match, we start the download over
905 self.report_unable_to_resume()
911 self.report_retry(count, retries)
914 self.trouble(u'ERROR: giving up after %s retries' % retries)
917 data_len = data.info().get('Content-length', None)
918 if data_len is not None:
919 data_len = long(data_len) + resume_len
920 data_len_str = self.format_bytes(data_len)
921 byte_counter = 0 + resume_len
927 data_block = data.read(block_size)
929 if len(data_block) == 0:
931 byte_counter += len(data_block)
933 # Open file just in time
936 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
937 assert stream is not None
938 filename = self.undo_temp_name(tmpfilename)
939 self.report_destination(filename)
940 except (OSError, IOError), err:
941 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
944 stream.write(data_block)
945 except (IOError, OSError), err:
946 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
948 block_size = self.best_block_size(after - before, len(data_block))
951 percent_str = self.calc_percent(byte_counter, data_len)
952 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
953 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
954 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
957 self.slow_down(start, byte_counter - resume_len)
960 self.trouble(u'\nERROR: Did not get any data blocks')
964 if data_len is not None and byte_counter != data_len:
965 raise ContentTooShortError(byte_counter, long(data_len))
966 self.try_rename(tmpfilename, filename)
968 # Update file modification time
969 if self.params.get('updatetime', True):
970 self.try_utime(filename, data.info().get('last-modified', None))
975 class InfoExtractor(object):
976 """Information Extractor class.
978 Information extractors are the classes that, given a URL, extract
979 information from the video (or videos) the URL refers to. This
980 information includes the real video URL, the video title and simplified
981 title, author and others. The information is stored in a dictionary
982 which is then passed to the FileDownloader. The FileDownloader
983 processes this information possibly downloading the video to the file
984 system, among other possible outcomes. The dictionaries must include
985 the following fields:
987 id: Video identifier.
988 url: Final video URL.
989 uploader: Nickname of the video uploader.
990 title: Literal title.
991 stitle: Simplified title.
992 ext: Video filename extension.
993 format: Video format.
994 player_url: SWF Player URL (may be None).
996 The following fields are optional. Their primary purpose is to allow
997 youtube-dl to serve as the backend for a video search function, such
998 as the one in youtube2mp3. They are only used when their respective
999 forced printing functions are called:
1001 thumbnail: Full URL to a video thumbnail image.
1002 description: One-line video description.
1004 Subclasses of this one should re-define the _real_initialize() and
1005 _real_extract() methods, as well as the suitable() static method.
1006 Probably, they should also be instantiated and added to the main
1013 def __init__(self, downloader=None):
1014 """Constructor. Receives an optional downloader."""
1016 self.set_downloader(downloader)
1020 """Receives a URL and returns True if suitable for this IE."""
1023 def initialize(self):
1024 """Initializes an instance (authentication, etc)."""
1026 self._real_initialize()
1029 def extract(self, url):
1030 """Extracts URL information and returns it in list of dicts."""
1032 return self._real_extract(url)
1034 def set_downloader(self, downloader):
1035 """Sets the downloader for this IE."""
1036 self._downloader = downloader
1038 def _real_initialize(self):
1039 """Real initialization process. Redefine in subclasses."""
1042 def _real_extract(self, url):
1043 """Real extraction process. Redefine in subclasses."""
1047 class YoutubeIE(InfoExtractor):
1048 """Information extractor for youtube.com."""
1050 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1051 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1052 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1053 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1054 _NETRC_MACHINE = 'youtube'
1055 # Listed in order of quality
1056 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1057 _video_extensions = {
1063 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1070 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1072 def report_lang(self):
1073 """Report attempt to set language."""
1074 self._downloader.to_screen(u'[youtube] Setting language')
1076 def report_login(self):
1077 """Report attempt to log in."""
1078 self._downloader.to_screen(u'[youtube] Logging in')
1080 def report_age_confirmation(self):
1081 """Report attempt to confirm age."""
1082 self._downloader.to_screen(u'[youtube] Confirming age')
1084 def report_video_webpage_download(self, video_id):
1085 """Report attempt to download video webpage."""
1086 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1088 def report_video_info_webpage_download(self, video_id):
1089 """Report attempt to download video info webpage."""
1090 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1092 def report_information_extraction(self, video_id):
1093 """Report attempt to extract video information."""
1094 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1096 def report_unavailable_format(self, video_id, format):
1097 """Report extracted video URL."""
1098 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1100 def report_rtmp_download(self):
1101 """Indicate the download will use the RTMP protocol."""
1102 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Initialize the YouTube session: pick up credentials (explicit params or
# .netrc), set the interface language, log in, and confirm age.
# NOTE(review): this view of the file is missing interior lines (returns,
# `try:` statements, dict literal delimiters) — comments below describe only
# what the visible lines establish.
1104 def _real_initialize(self):
# Bail out early when no downloader is attached (body elided in this view).
1105 if self._downloader is None:
1110 downloader_params = self._downloader.params
1112 # Attempt to use provided username and password or .netrc data
1113 if downloader_params.get('username', None) is not None:
1114 username = downloader_params['username']
1115 password = downloader_params['password']
1116 elif downloader_params.get('usenetrc', False):
# .netrc fallback: look up credentials for the machine entry; a missing
# entry is reported as a NetrcParseError and only warned about.
1118 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1119 if info is not None:
1123 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1124 except (IOError, netrc.NetrcParseError), err:
1125 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: fetch _LANG_URL for its cookie side effect; failures are
# non-fatal (warning only).
1129 request = urllib2.Request(self._LANG_URL)
1132 urllib2.urlopen(request).read()
1133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1134 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1137 # No authentication to be performed
1138 if username is None:
# Log in: POST the login form; a login form still present in the response
# means the credentials were rejected.
1143 'current_form': 'loginForm',
1145 'action_login': 'Log In',
1146 'username': username,
1147 'password': password,
1149 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1152 login_results = urllib2.urlopen(request).read()
1153 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1154 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1156 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1157 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age: POST the confirmation form; unlike the steps above, failure
# here is treated as a hard error (trouble, not a warning).
1163 'action_confirm': 'Confirm',
1165 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1167 self.report_age_confirmation()
1168 age_results = urllib2.urlopen(request).read()
1169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1170 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extract a YouTube video: download the watch page and get_video_info data,
# pull out metadata, choose the format(s) to fetch, and hand each one to the
# downloader via process_info.
# NOTE(review): interior lines (if-mobj-is-None guards, `try:`, `return`,
# closing braces) are elided in this view; comments describe visible lines only.
1173 def _real_extract(self, url):
1174 # Extract video id from URL
1175 mobj = re.match(self._VALID_URL, url)
1177 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 of _VALID_URL carries the 11-char video id — TODO confirm against
# the pattern definition (outside this view).
1179 video_id = mobj.group(2)
1182 self.report_video_webpage_download(video_id)
# has_verified=1 requests the page as an age-verified session.
1183 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1185 video_webpage = urllib2.urlopen(request).read()
1186 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1187 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1190 # Attempt to extract SWF player URL
# The URL appears JSON-escaped in the page ("http:\/\/..."); the re.sub
# strips the backslash escapes.
1191 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1192 if mobj is not None:
1193 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a 'token'.
1198 self.report_video_info_webpage_download(video_id)
1199 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1200 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1201 % (video_id, el_type))
1202 request = urllib2.Request(video_info_url)
1204 video_info_webpage = urllib2.urlopen(request).read()
1205 video_info = parse_qs(video_info_webpage)
1206 if 'token' in video_info:
1208 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1209 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No token from any variant: surface YouTube's own 'reason' when present.
1211 if 'token' not in video_info:
1212 if 'reason' in video_info:
1213 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1215 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1218 # Start extracting information
1219 self.report_information_extraction(video_id)
# uploader
1222 if 'author' not in video_info:
1223 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1225 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title (percent-decoded bytes -> unicode -> sanitized)
1228 if 'title' not in video_info:
1229 self._downloader.trouble(u'ERROR: unable to extract video title')
1231 video_title = urllib.unquote_plus(video_info['title'][0])
1232 video_title = video_title.decode('utf-8')
1233 video_title = sanitize_title(video_title)
# Simplified title: collapse anything outside simple_title_chars to '_'.
1236 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1237 simple_title = simple_title.strip(ur'_')
# thumbnail is optional — a missing one only warns.
1240 if 'thumbnail_url' not in video_info:
1241 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1242 video_thumbnail = ''
1243 else: # don't panic if we can't find it
1244 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page and normalized to YYYYMMDD by
# trying several textual date formats.
1248 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1249 if mobj is not None:
1250 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1251 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1252 for expression in format_expressions:
1254 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: only extracted when the user asked for it; falls back to the
# meta tag, with an lxml branch elsewhere in the original control flow.
1262 video_description = u'No description available.'
1263 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1264 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1265 if mobj is not None:
1266 video_description = mobj.group(1).decode('utf-8')
1268 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1269 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1270 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1271 # TODO use another parser
1274 video_token = urllib.unquote_plus(video_info['token'][0])
1276 # Decide which formats to download
1277 req_format = self._downloader.params.get('format', None)
# RTMP streams carry their URL in 'conn' and have no itag/format choice.
1279 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1280 self.report_rtmp_download()
1281 video_url_list = [(None, video_info['conn'][0])]
1282 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# The stream map is a comma-separated list of querystring-encoded dicts;
# build itag -> url, keeping only complete entries.
1283 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1284 url_data = [parse_qs(uds) for uds in url_data_strs]
1285 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1286 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# format_limit caps quality: slice the preference list from that format on.
1288 format_limit = self._downloader.params.get('format_limit', None)
1289 if format_limit is not None and format_limit in self._available_formats:
1290 format_list = self._available_formats[self._available_formats.index(format_limit):]
1292 format_list = self._available_formats
1293 existing_formats = [x for x in format_list if x in url_map]
1294 if len(existing_formats) == 0:
1295 self._downloader.trouble(u'ERROR: no known formats available for video')
1297 if req_format is None:
1298 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1299 elif req_format == '-1':
1300 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1303 if req_format not in url_map:
1304 self._downloader.trouble(u'ERROR: requested format not available')
1306 video_url_list = [(req_format, url_map[req_format])] # Specific format
1308 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Hand every chosen (format, url) pair to the downloader.
1311 for format_param, video_real_url in video_url_list:
1312 # At this point we have a new video
1313 self._downloader.increment_downloads()
# Extension determined by format; 'flv' is the historical default.
1316 video_extension = self._video_extensions.get(format_param, 'flv')
1319 # Process video information
1320 self._downloader.process_info({
1321 'id': video_id.decode('utf-8'),
1322 'url': video_real_url.decode('utf-8'),
1323 'uploader': video_uploader.decode('utf-8'),
1324 'upload_date': upload_date,
1325 'title': video_title,
1326 'stitle': simple_title,
1327 'ext': video_extension.decode('utf-8'),
# Py2 and-or idiom: u'NA' when format_param is None, else the decoded format.
1328 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1329 'thumbnail': video_thumbnail.decode('utf-8'),
1330 'description': video_description,
1331 'player_url': player_url,
1333 except UnavailableVideoError, err:
1334 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
1337 class MetacafeIE(InfoExtractor):
1338 """Information Extractor for metacafe.com."""
1340 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1341 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1342 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a reference to a YoutubeIE so yt-hosted Metacafe ids can be delegated.
1345 def __init__(self, youtube_ie, downloader=None):
1346 InfoExtractor.__init__(self, downloader)
1347 self._youtube_ie = youtube_ie
# URL-suitability predicate (enclosing def elided in this view).
1351 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1353 def report_disclaimer(self):
1354 """Report disclaimer retrieval."""
1355 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1357 def report_age_confirmation(self):
1358 """Report attempt to confirm age."""
1359 self._downloader.to_screen(u'[metacafe] Confirming age')
1361 def report_download_webpage(self, video_id):
1362 """Report webpage download."""
1363 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1365 def report_extraction(self, video_id):
1366 """Report information extraction."""
1367 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Session setup: fetch the family-filter disclaimer, then POST past it.
1369 def _real_initialize(self):
1370 # Retrieve disclaimer
1371 request = urllib2.Request(self._DISCLAIMER)
1373 self.report_disclaimer()
1374 disclaimer = urllib2.urlopen(request).read()
1375 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1376 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age by submitting the filter form.
1382 'submit': "Continue - I'm over 18",
1384 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1386 self.report_age_confirmation()
1387 disclaimer = urllib2.urlopen(request).read()
1388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1389 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1392 def _real_extract(self, url):
1393 # Extract id and simplified title from URL
1394 mobj = re.match(self._VALID_URL, url)
1396 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1399 video_id = mobj.group(1)
1401 # Check if video comes from YouTube
# Ids of the form 'yt-XXXX' are YouTube videos — delegate to the YouTube IE.
1402 mobj2 = re.match(r'^yt-(.*)$', video_id)
1403 if mobj2 is not None:
1404 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1407 # At this point we have a new video
1408 self._downloader.increment_downloads()
1410 simple_title = mobj.group(2).decode('utf-8')
1412 # Retrieve video webpage to extract further information
1413 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1415 self.report_download_webpage(video_id)
1416 webpage = urllib2.urlopen(request).read()
1417 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1418 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1421 # Extract URL, uploader and title from webpage
1422 self.report_extraction(video_id)
# Primary path: a raw &mediaURL= parameter in the page.
1423 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1424 if mobj is not None:
1425 mediaURL = urllib.unquote(mobj.group(1))
1426 video_extension = mediaURL[-3:]
1428 # Extract gdaKey if available
1429 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1431 video_url = mediaURL
1433 gdaKey = mobj.group(1)
1434 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: dig mediaURL/key out of the flashvars querystring.
1436 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1438 self._downloader.trouble(u'ERROR: unable to extract media URL')
1440 vardict = parse_qs(mobj.group(1))
1441 if 'mediaData' not in vardict:
1442 self._downloader.trouble(u'ERROR: unable to extract media URL')
1444 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1446 self._downloader.trouble(u'ERROR: unable to extract media URL')
# JSON-escaped slashes ('\/') are unescaped before use.
1448 mediaURL = mobj.group(1).replace('\\/', '/')
1449 video_extension = mediaURL[-3:]
1450 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1452 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1454 self._downloader.trouble(u'ERROR: unable to extract title')
1456 video_title = mobj.group(1).decode('utf-8')
1457 video_title = sanitize_title(video_title)
1459 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1461 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1463 video_uploader = mobj.group(1)
1466 # Process video information
1467 self._downloader.process_info({
1468 'id': video_id.decode('utf-8'),
1469 'url': video_url.decode('utf-8'),
1470 'uploader': video_uploader.decode('utf-8'),
1471 'upload_date': u'NA',
1472 'title': video_title,
1473 'stitle': simple_title,
1474 'ext': video_extension.decode('utf-8'),
1478 except UnavailableVideoError:
1479 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
1482 class DailymotionIE(InfoExtractor):
1483 """Information Extractor for Dailymotion"""
1485 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1487 def __init__(self, downloader=None):
1488 InfoExtractor.__init__(self, downloader)
# URL-suitability predicate (enclosing def elided in this view).
1492 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1494 def report_download_webpage(self, video_id):
1495 """Report webpage download."""
1496 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1498 def report_extraction(self, video_id):
1499 """Report information extraction."""
1500 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup needed for Dailymotion.
1502 def _real_initialize(self):
1505 def _real_extract(self, url):
1506 # Extract id and simplified title from URL
1507 mobj = re.match(self._VALID_URL, url)
1509 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1512 # At this point we have a new video
1513 self._downloader.increment_downloads()
# Group 1 is the video id, group 2 the human-readable slug used as stitle.
1514 video_id = mobj.group(1)
1516 simple_title = mobj.group(2).decode('utf-8')
1517 video_extension = 'flv'
1519 # Retrieve video webpage to extract further information
1520 request = urllib2.Request(url)
1522 self.report_download_webpage(video_id)
1523 webpage = urllib2.urlopen(request).read()
1524 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1525 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1528 # Extract URL, uploader and title from webpage
1529 self.report_extraction(video_id)
# The media URL lives in an addVariable("video", "...") flash call.
1530 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1532 self._downloader.trouble(u'ERROR: unable to extract media URL')
1534 mediaURL = urllib.unquote(mobj.group(1))
1536 # if needed add http://www.dailymotion.com/ if relative URL
1538 video_url = mediaURL
1540 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1541 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1543 self._downloader.trouble(u'ERROR: unable to extract title')
1545 video_title = mobj.group(1).decode('utf-8')
1546 video_title = sanitize_title(video_title)
1548 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1550 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1552 video_uploader = mobj.group(1)
1555 # Process video information
1556 self._downloader.process_info({
1557 'id': video_id.decode('utf-8'),
1558 'url': video_url.decode('utf-8'),
1559 'uploader': video_uploader.decode('utf-8'),
1560 'upload_date': u'NA',
1561 'title': video_title,
1562 'stitle': simple_title,
1563 'ext': video_extension.decode('utf-8'),
1567 except UnavailableVideoError:
1568 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
1571 class GoogleIE(InfoExtractor):
1572 """Information extractor for video.google.com."""
1574 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1576 def __init__(self, downloader=None):
1577 InfoExtractor.__init__(self, downloader)
# URL-suitability predicate (enclosing def elided in this view).
1581 return (re.match(GoogleIE._VALID_URL, url) is not None)
1583 def report_download_webpage(self, video_id):
1584 """Report webpage download."""
1585 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1587 def report_extraction(self, video_id):
1588 """Report information extraction."""
1589 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No session setup needed for Google Video.
1591 def _real_initialize(self):
1594 def _real_extract(self, url):
1595 # Extract id from URL
1596 mobj = re.match(self._VALID_URL, url)
1598 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1601 # At this point we have a new video
1602 self._downloader.increment_downloads()
1603 video_id = mobj.group(1)
1605 video_extension = 'mp4'
1607 # Retrieve video webpage to extract further information
1608 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1610 self.report_download_webpage(video_id)
1611 webpage = urllib2.urlopen(request).read()
1612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1616 # Extract URL, uploader, and title from webpage
1617 self.report_extraction(video_id)
# Preferred path: a direct download_url (mp4); fallback below scrapes the
# flash videoUrl (flv) with its \xNN escapes decoded.
1618 mobj = re.search(r"download_url:'([^']+)'", webpage)
1620 video_extension = 'flv'
1621 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1623 self._downloader.trouble(u'ERROR: unable to extract media URL')
1625 mediaURL = urllib.unquote(mobj.group(1))
1626 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1627 mediaURL = mediaURL.replace('\\x26', '\x26')
1629 video_url = mediaURL
1631 mobj = re.search(r'<title>(.*)</title>', webpage)
1633 self._downloader.trouble(u'ERROR: unable to extract title')
1635 video_title = mobj.group(1).decode('utf-8')
1636 video_title = sanitize_title(video_title)
1637 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1639 # Extract video description
1640 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1642 self._downloader.trouble(u'ERROR: unable to extract video description')
1644 video_description = mobj.group(1).decode('utf-8')
1645 if not video_description:
1646 video_description = 'No description available.'
1648 # Extract video thumbnail
# Thumbnail requires a second search-page request, so it is only done on demand.
1649 if self._downloader.params.get('forcethumbnail', False):
1650 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1652 webpage = urllib2.urlopen(request).read()
1653 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1654 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1656 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1658 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1660 video_thumbnail = mobj.group(1)
1661 else: # we need something to pass to process_info
1662 video_thumbnail = ''
1665 # Process video information
1666 self._downloader.process_info({
1667 'id': video_id.decode('utf-8'),
1668 'url': video_url.decode('utf-8'),
1670 'upload_date': u'NA',
1671 'title': video_title,
1672 'stitle': simple_title,
1673 'ext': video_extension.decode('utf-8'),
1677 except UnavailableVideoError:
1678 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
1681 class PhotobucketIE(InfoExtractor):
1682 """Information extractor for photobucket.com."""
1684 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1686 def __init__(self, downloader=None):
1687 InfoExtractor.__init__(self, downloader)
# URL-suitability predicate (enclosing def elided in this view).
1691 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1693 def report_download_webpage(self, video_id):
1694 """Report webpage download."""
1695 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1697 def report_extraction(self, video_id):
1698 """Report information extraction."""
1699 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No session setup needed for Photobucket.
1701 def _real_initialize(self):
1704 def _real_extract(self, url):
1705 # Extract id from URL
1706 mobj = re.match(self._VALID_URL, url)
1708 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1711 # At this point we have a new video
1712 self._downloader.increment_downloads()
# _VALID_URL only matches .flv 'current=' values, hence the fixed extension.
1713 video_id = mobj.group(1)
1715 video_extension = 'flv'
1717 # Retrieve video webpage to extract further information
1718 request = urllib2.Request(url)
1720 self.report_download_webpage(video_id)
1721 webpage = urllib2.urlopen(request).read()
1722 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1723 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1726 # Extract URL, uploader, and title from webpage
1727 self.report_extraction(video_id)
1728 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1730 self._downloader.trouble(u'ERROR: unable to extract media URL')
1732 mediaURL = urllib.unquote(mobj.group(1))
1734 video_url = mediaURL
# Title and uploader both come from the <title> tag pattern below.
1736 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1738 self._downloader.trouble(u'ERROR: unable to extract title')
1740 video_title = mobj.group(1).decode('utf-8')
1741 video_title = sanitize_title(video_title)
1742 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1744 video_uploader = mobj.group(2).decode('utf-8')
1747 # Process video information
1748 self._downloader.process_info({
1749 'id': video_id.decode('utf-8'),
1750 'url': video_url.decode('utf-8'),
1751 'uploader': video_uploader,
1752 'upload_date': u'NA',
1753 'title': video_title,
1754 'stitle': simple_title,
1755 'ext': video_extension.decode('utf-8'),
1759 except UnavailableVideoError:
1760 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
1763 class YahooIE(InfoExtractor):
1764 """Information extractor for video.yahoo.com."""
1766 # _VALID_URL matches all Yahoo! Video URLs
1767 # _VPAGE_URL matches only the extractable '/watch/' URLs
1768 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1769 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1771 def __init__(self, downloader=None):
1772 InfoExtractor.__init__(self, downloader)
# URL-suitability predicate (enclosing def elided in this view).
1776 return (re.match(YahooIE._VALID_URL, url) is not None)
1778 def report_download_webpage(self, video_id):
1779 """Report webpage download."""
1780 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1782 def report_extraction(self, video_id):
1783 """Report information extraction."""
1784 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No session setup needed for Yahoo Video.
1786 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting.
1789 def _real_extract(self, url, new_video=True):
1790 # Extract ID from URL
1791 mobj = re.match(self._VALID_URL, url)
1793 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1796 # At this point we have a new video
1797 self._downloader.increment_downloads()
1798 video_id = mobj.group(2)
1799 video_extension = 'flv'
1801 # Rewrite valid but non-extractable URLs as
1802 # extractable English language /watch/ URLs
1803 if re.match(self._VPAGE_URL, url) is None:
1804 request = urllib2.Request(url)
1806 webpage = urllib2.urlopen(request).read()
1807 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1808 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1811 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1813 self._downloader.trouble(u'ERROR: Unable to extract id field')
1815 yahoo_id = mobj.group(1)
1817 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1819 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1821 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
1823 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1824 return self._real_extract(url, new_video=False)
1826 # Retrieve video webpage to extract further information
1827 request = urllib2.Request(url)
1829 self.report_download_webpage(video_id)
1830 webpage = urllib2.urlopen(request).read()
1831 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1832 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1835 # Extract uploader and title from webpage
1836 self.report_extraction(video_id)
1837 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1839 self._downloader.trouble(u'ERROR: unable to extract video title')
1841 video_title = mobj.group(1).decode('utf-8')
1842 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1844 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1846 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternation, not the
# uploader name in group(2) — looks like a latent bug; confirm before fixing.
1848 video_uploader = mobj.group(1).decode('utf-8')
1850 # Extract video thumbnail
1851 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1853 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1855 video_thumbnail = mobj.group(1).decode('utf-8')
1857 # Extract video description
1858 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1860 self._downloader.trouble(u'ERROR: unable to extract video description')
1862 video_description = mobj.group(1).decode('utf-8')
1863 if not video_description:
1864 video_description = 'No description available.'
1866 # Extract video height and width
# Needed to build the playlist request below.
1867 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1869 self._downloader.trouble(u'ERROR: unable to extract video height')
1871 yv_video_height = mobj.group(1)
1873 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1875 self._downloader.trouble(u'ERROR: unable to extract video width')
1877 yv_video_width = mobj.group(1)
1879 # Retrieve video playlist to extract media URL
1880 # I'm not completely sure what all these options are, but we
1881 # seem to need most of them, otherwise the server sends a 401.
1882 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1883 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1884 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1885 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1886 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1888 self.report_download_webpage(video_id)
1889 webpage = urllib2.urlopen(request).read()
1890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1894 # Extract media URL from playlist XML
1895 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1897 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1899 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1900 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1903 # Process video information
1904 self._downloader.process_info({
1905 'id': video_id.decode('utf-8'),
1907 'uploader': video_uploader,
1908 'upload_date': u'NA',
1909 'title': video_title,
1910 'stitle': simple_title,
1911 'ext': video_extension.decode('utf-8'),
1912 'thumbnail': video_thumbnail.decode('utf-8'),
1913 'description': video_description,
# NOTE(review): 'thumbnail' and 'description' are repeated below — duplicate
# dict keys; the later entries win, so .decode('utf-8') above is discarded.
1914 'thumbnail': video_thumbnail,
1915 'description': video_description,
1918 except UnavailableVideoError:
1919 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
1922 class VimeoIE(InfoExtractor):
1923 """Information extractor for vimeo.com."""
1925 # _VALID_URL matches Vimeo URLs
1926 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1928 def __init__(self, downloader=None):
1929 InfoExtractor.__init__(self, downloader)
# URL-suitability predicate (enclosing def elided in this view).
1933 return (re.match(VimeoIE._VALID_URL, url) is not None)
1935 def report_download_webpage(self, video_id):
1936 """Report webpage download."""
1937 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1939 def report_extraction(self, video_id):
1940 """Report information extraction."""
1941 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No session setup needed for Vimeo.
1943 def _real_initialize(self):
1946 def _real_extract(self, url, new_video=True):
1947 # Extract ID from URL
1948 mobj = re.match(self._VALID_URL, url)
1950 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1953 # At this point we have a new video
1954 self._downloader.increment_downloads()
1955 video_id = mobj.group(1)
1957 # Retrieve video webpage to extract further information
# moogaloop/load returns an XML clip description for the id.
1958 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1960 self.report_download_webpage(video_id)
1961 webpage = urllib2.urlopen(request).read()
1962 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1963 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1966 # Now we begin extracting as much information as we can from what we
1967 # retrieved. First we extract the information common to all extractors,
1968 # and latter we extract those that are Vimeo specific.
1969 self.report_extraction(video_id)
1972 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1974 self._downloader.trouble(u'ERROR: unable to extract video title')
1976 video_title = mobj.group(1).decode('utf-8')
1977 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1980 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1982 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1984 video_uploader = mobj.group(1).decode('utf-8')
1986 # Extract video thumbnail
1987 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1989 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1991 video_thumbnail = mobj.group(1).decode('utf-8')
1993 # # Extract video description
1994 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1996 # self._downloader.trouble(u'ERROR: unable to extract video description')
1998 # video_description = mobj.group(1).decode('utf-8')
1999 # if not video_description: video_description = 'No description available.'
# Placeholder while the real description extraction above stays disabled.
2000 video_description = 'Foo.'
2002 # Vimeo specific: extract request signature
# sig + sig_exp are required query parts of the moogaloop play URL.
2003 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2005 self._downloader.trouble(u'ERROR: unable to extract request signature')
2007 sig = mobj.group(1).decode('utf-8')
2009 # Vimeo specific: Extract request signature expiration
2010 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2012 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2014 sig_exp = mobj.group(1).decode('utf-8')
2016 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2019 # Process video information
2020 self._downloader.process_info({
2021 'id': video_id.decode('utf-8'),
2023 'uploader': video_uploader,
2024 'upload_date': u'NA',
2025 'title': video_title,
2026 'stitle': simple_title,
2028 'thumbnail': video_thumbnail.decode('utf-8'),
2029 'description': video_description,
# NOTE(review): 'thumbnail' and 'description' repeated — duplicate dict keys;
# the later entries win, so the .decode('utf-8') variant above is discarded.
2030 'thumbnail': video_thumbnail,
2031 'description': video_description,
2034 except UnavailableVideoError:
2035 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): interior lines (guards, `try:`, `return`) are elided in this
# view of the file; added comments describe only what the visible lines show.
2038 class GenericIE(InfoExtractor):
2039 """Generic last-resort information extractor."""
2041 def __init__(self, downloader=None):
2042 InfoExtractor.__init__(self, downloader)
2048 def report_download_webpage(self, video_id):
2049 """Report webpage download."""
# Warn first: reaching this extractor means every site-specific IE declined.
2050 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2051 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2053 def report_extraction(self, video_id):
2054 """Report information extraction."""
2055 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No session setup needed for the generic extractor.
2057 def _real_initialize(self):
2060 def _real_extract(self, url):
2061 # At this point we have a new video
2062 self._downloader.increment_downloads()
# Provisional id: last path component; replaced below once a media URL is found.
2064 video_id = url.split('/')[-1]
2065 request = urllib2.Request(url)
2067 self.report_download_webpage(video_id)
2068 webpage = urllib2.urlopen(request).read()
2069 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2070 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2072 except ValueError, err:
2073 # since this is the last-resort InfoExtractor, if
2074 # this error is thrown, it'll be thrown here
2075 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2078 self.report_extraction(video_id)
2079 # Start with something easy: JW Player in SWFObject
2080 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2082 # Broaden the search a little bit
2083 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2085 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2088 # It's possible that one of the regexes
2089 # matched, but returned an empty group:
2090 if mobj.group(1) is None:
2091 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2094 video_url = urllib.unquote(mobj.group(1))
2095 video_id = os.path.basename(video_url)
2097 # here's a fun little line of code for you:
# Split the media filename into (id, extension) via its suffix.
2098 video_extension = os.path.splitext(video_id)[1][1:]
2099 video_id = os.path.splitext(video_id)[0]
2101 # it's tempting to parse this further, but you would
2102 # have to take into account all the variations like
2103 # Video Title - Site Name
2104 # Site Name | Video Title
2105 # Video Title - Tagline | Site Name
2106 # and so on and so forth; it's just not practical
2107 mobj = re.search(r'<title>(.*)</title>', webpage)
2109 self._downloader.trouble(u'ERROR: unable to extract title')
2111 video_title = mobj.group(1).decode('utf-8')
2112 video_title = sanitize_title(video_title)
2113 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2115 # video uploader is domain name
2116 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2118 self._downloader.trouble(u'ERROR: unable to extract title')
2120 video_uploader = mobj.group(1).decode('utf-8')
2123 # Process video information
2124 self._downloader.process_info({
2125 'id': video_id.decode('utf-8'),
2126 'url': video_url.decode('utf-8'),
2127 'uploader': video_uploader,
2128 'upload_date': u'NA',
2129 'title': video_title,
2130 'stitle': simple_title,
2131 'ext': video_extension.decode('utf-8'),
2135 except UnavailableVideoError, err:
2136 self._downloader.trouble(u'\nERROR: unable to download video')
2139 class YoutubeSearchIE(InfoExtractor):
2140 """Information Extractor for YouTube search queries."""
2141 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2142 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2143 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2144 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2146 _max_youtube_results = 1000
2148 def __init__(self, youtube_ie, downloader=None):
2149 InfoExtractor.__init__(self, downloader)
2150 self._youtube_ie = youtube_ie
2154 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2156 def report_download_page(self, query, pagenum):
2157 """Report attempt to download playlist page with given number."""
2158 query = query.decode(preferredencoding())
2159 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2161 def _real_initialize(self):
2162 self._youtube_ie.initialize()
2164 def _real_extract(self, query):
2165 mobj = re.match(self._VALID_QUERY, query)
2167 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2170 prefix, query = query.split(':')
2172 query = query.encode('utf-8')
2174 self._download_n_results(query, 1)
2176 elif prefix == 'all':
2177 self._download_n_results(query, self._max_youtube_results)
2183 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2185 elif n > self._max_youtube_results:
2186 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2187 n = self._max_youtube_results
2188 self._download_n_results(query, n)
2190 except ValueError: # parsing prefix as integer fails
2191 self._download_n_results(query, 1)
2194 def _download_n_results(self, query, n):
2195 """Downloads a specified number of results for a query"""
2198 already_seen = set()
2202 self.report_download_page(query, pagenum)
2203 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2204 request = urllib2.Request(result_url)
2206 page = urllib2.urlopen(request).read()
2207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2208 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2211 # Extract video identifiers
2212 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2213 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2214 if video_id not in already_seen:
2215 video_ids.append(video_id)
2216 already_seen.add(video_id)
2217 if len(video_ids) == n:
2218 # Specified n videos reached
2219 for id in video_ids:
2220 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2223 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2224 for id in video_ids:
2225 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2228 pagenum = pagenum + 1
2231 class GoogleSearchIE(InfoExtractor):
2232 """Information Extractor for Google Video search queries."""
2233 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2234 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2235 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2236 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2238 _max_google_results = 1000
2240 def __init__(self, google_ie, downloader=None):
2241 InfoExtractor.__init__(self, downloader)
2242 self._google_ie = google_ie
2246 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2248 def report_download_page(self, query, pagenum):
2249 """Report attempt to download playlist page with given number."""
2250 query = query.decode(preferredencoding())
2251 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2253 def _real_initialize(self):
2254 self._google_ie.initialize()
2256 def _real_extract(self, query):
2257 mobj = re.match(self._VALID_QUERY, query)
2259 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2262 prefix, query = query.split(':')
2264 query = query.encode('utf-8')
2266 self._download_n_results(query, 1)
2268 elif prefix == 'all':
2269 self._download_n_results(query, self._max_google_results)
2275 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2277 elif n > self._max_google_results:
2278 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2279 n = self._max_google_results
2280 self._download_n_results(query, n)
2282 except ValueError: # parsing prefix as integer fails
2283 self._download_n_results(query, 1)
2286 def _download_n_results(self, query, n):
2287 """Downloads a specified number of results for a query"""
2290 already_seen = set()
2294 self.report_download_page(query, pagenum)
2295 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2296 request = urllib2.Request(result_url)
2298 page = urllib2.urlopen(request).read()
2299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2303 # Extract video identifiers
2304 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2305 video_id = mobj.group(1)
2306 if video_id not in already_seen:
2307 video_ids.append(video_id)
2308 already_seen.add(video_id)
2309 if len(video_ids) == n:
2310 # Specified n videos reached
2311 for id in video_ids:
2312 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2315 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2316 for id in video_ids:
2317 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2320 pagenum = pagenum + 1
2323 class YahooSearchIE(InfoExtractor):
2324 """Information Extractor for Yahoo! Video search queries."""
2325 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2326 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2327 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2328 _MORE_PAGES_INDICATOR = r'\s*Next'
2330 _max_yahoo_results = 1000
2332 def __init__(self, yahoo_ie, downloader=None):
2333 InfoExtractor.__init__(self, downloader)
2334 self._yahoo_ie = yahoo_ie
2338 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2340 def report_download_page(self, query, pagenum):
2341 """Report attempt to download playlist page with given number."""
2342 query = query.decode(preferredencoding())
2343 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2345 def _real_initialize(self):
2346 self._yahoo_ie.initialize()
2348 def _real_extract(self, query):
2349 mobj = re.match(self._VALID_QUERY, query)
2351 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2354 prefix, query = query.split(':')
2356 query = query.encode('utf-8')
2358 self._download_n_results(query, 1)
2360 elif prefix == 'all':
2361 self._download_n_results(query, self._max_yahoo_results)
2367 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2369 elif n > self._max_yahoo_results:
2370 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2371 n = self._max_yahoo_results
2372 self._download_n_results(query, n)
2374 except ValueError: # parsing prefix as integer fails
2375 self._download_n_results(query, 1)
2378 def _download_n_results(self, query, n):
2379 """Downloads a specified number of results for a query"""
2382 already_seen = set()
2386 self.report_download_page(query, pagenum)
2387 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2388 request = urllib2.Request(result_url)
2390 page = urllib2.urlopen(request).read()
2391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2395 # Extract video identifiers
2396 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2397 video_id = mobj.group(1)
2398 if video_id not in already_seen:
2399 video_ids.append(video_id)
2400 already_seen.add(video_id)
2401 if len(video_ids) == n:
2402 # Specified n videos reached
2403 for id in video_ids:
2404 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2407 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2408 for id in video_ids:
2409 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2412 pagenum = pagenum + 1
2415 class YoutubePlaylistIE(InfoExtractor):
2416 """Information Extractor for YouTube playlists."""
2418 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2419 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2420 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2421 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2424 def __init__(self, youtube_ie, downloader=None):
2425 InfoExtractor.__init__(self, downloader)
2426 self._youtube_ie = youtube_ie
2430 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2432 def report_download_page(self, playlist_id, pagenum):
2433 """Report attempt to download playlist page with given number."""
2434 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2436 def _real_initialize(self):
2437 self._youtube_ie.initialize()
2439 def _real_extract(self, url):
2440 # Extract playlist id
2441 mobj = re.match(self._VALID_URL, url)
2443 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2447 if mobj.group(3) is not None:
2448 self._youtube_ie.extract(mobj.group(3))
2451 # Download playlist pages
2452 # prefix is 'p' as default for playlists but there are other types that need extra care
2453 playlist_prefix = mobj.group(1)
2454 if playlist_prefix == 'a':
2455 playlist_access = 'artist'
2457 playlist_prefix = 'p'
2458 playlist_access = 'view_play_list'
2459 playlist_id = mobj.group(2)
2464 self.report_download_page(playlist_id, pagenum)
2465 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2467 page = urllib2.urlopen(request).read()
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2472 # Extract video identifiers
2474 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2475 if mobj.group(1) not in ids_in_page:
2476 ids_in_page.append(mobj.group(1))
2477 video_ids.extend(ids_in_page)
2479 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2481 pagenum = pagenum + 1
2483 playliststart = self._downloader.params.get('playliststart', 1) - 1
2484 playlistend = self._downloader.params.get('playlistend', -1)
2485 video_ids = video_ids[playliststart:playlistend]
2487 for id in video_ids:
2488 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2492 class YoutubeUserIE(InfoExtractor):
2493 """Information Extractor for YouTube users."""
2495 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2496 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2497 _GDATA_PAGE_SIZE = 50
2498 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2499 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2502 def __init__(self, youtube_ie, downloader=None):
2503 InfoExtractor.__init__(self, downloader)
2504 self._youtube_ie = youtube_ie
2508 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2510 def report_download_page(self, username, start_index):
2511 """Report attempt to download user page."""
2512 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2513 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2515 def _real_initialize(self):
2516 self._youtube_ie.initialize()
2518 def _real_extract(self, url):
2520 mobj = re.match(self._VALID_URL, url)
2522 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2525 username = mobj.group(1)
2527 # Download video ids using YouTube Data API. Result size per
2528 # query is limited (currently to 50 videos) so we need to query
2529 # page by page until there are no video ids - it means we got
2536 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2537 self.report_download_page(username, start_index)
2539 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2542 page = urllib2.urlopen(request).read()
2543 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2547 # Extract video identifiers
2550 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2551 if mobj.group(1) not in ids_in_page:
2552 ids_in_page.append(mobj.group(1))
2554 video_ids.extend(ids_in_page)
2556 # A little optimization - if current page is not
2557 # "full", ie. does not contain PAGE_SIZE video ids then
2558 # we can assume that this page is the last one - there
2559 # are no more ids on further pages - no need to query
2562 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2567 all_ids_count = len(video_ids)
2568 playliststart = self._downloader.params.get('playliststart', 1) - 1
2569 playlistend = self._downloader.params.get('playlistend', -1)
2571 if playlistend == -1:
2572 video_ids = video_ids[playliststart:]
2574 video_ids = video_ids[playliststart:playlistend]
2576 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2577 (username, all_ids_count, len(video_ids)))
2579 for video_id in video_ids:
2580 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2583 class DepositFilesIE(InfoExtractor):
2584 """Information extractor for depositfiles.com"""
2586 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2588 def __init__(self, downloader=None):
2589 InfoExtractor.__init__(self, downloader)
2593 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2595 def report_download_webpage(self, file_id):
2596 """Report webpage download."""
2597 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2599 def report_extraction(self, file_id):
2600 """Report information extraction."""
2601 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2603 def _real_initialize(self):
2606 def _real_extract(self, url):
2607 # At this point we have a new file
2608 self._downloader.increment_downloads()
2610 file_id = url.split('/')[-1]
2611 # Rebuild url in english locale
2612 url = 'http://depositfiles.com/en/files/' + file_id
2614 # Retrieve file webpage with 'Free download' button pressed
2615 free_download_indication = { 'gateway_result' : '1' }
2616 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2618 self.report_download_webpage(file_id)
2619 webpage = urllib2.urlopen(request).read()
2620 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2621 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2624 # Search for the real file URL
2625 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2626 if (mobj is None) or (mobj.group(1) is None):
2627 # Try to figure out reason of the error.
2628 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2629 if (mobj is not None) and (mobj.group(1) is not None):
2630 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2631 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2633 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2636 file_url = mobj.group(1)
2637 file_extension = os.path.splitext(file_url)[1][1:]
2639 # Search for file title
2640 mobj = re.search(r'<b title="(.*?)">', webpage)
2642 self._downloader.trouble(u'ERROR: unable to extract title')
2644 file_title = mobj.group(1).decode('utf-8')
2647 # Process file information
2648 self._downloader.process_info({
2649 'id': file_id.decode('utf-8'),
2650 'url': file_url.decode('utf-8'),
2652 'upload_date': u'NA',
2653 'title': file_title,
2654 'stitle': file_title,
2655 'ext': file_extension.decode('utf-8'),
2659 except UnavailableVideoError, err:
2660 self._downloader.trouble(u'ERROR: unable to download file')
2663 class FacebookIE(InfoExtractor):
2664 """Information Extractor for Facebook"""
2666 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2667 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2668 _NETRC_MACHINE = 'facebook'
2669 _available_formats = ['highqual', 'lowqual']
2670 _video_extensions = {
2675 def __init__(self, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2680 return (re.match(FacebookIE._VALID_URL, url) is not None)
2682 def _reporter(self, message):
2683 """Add header and report message."""
2684 self._downloader.to_screen(u'[facebook] %s' % message)
2686 def report_login(self):
2687 """Report attempt to log in."""
2688 self._reporter(u'Logging in')
2690 def report_video_webpage_download(self, video_id):
2691 """Report attempt to download video webpage."""
2692 self._reporter(u'%s: Downloading video webpage' % video_id)
2694 def report_information_extraction(self, video_id):
2695 """Report attempt to extract video information."""
2696 self._reporter(u'%s: Extracting video information' % video_id)
2698 def _parse_page(self, video_webpage):
2699 """Extract video information from page"""
2701 data = {'title': r'class="video_title datawrap">(.*?)</',
2702 'description': r'<div class="datawrap">(.*?)</div>',
2703 'owner': r'\("video_owner_name", "(.*?)"\)',
2704 'upload_date': r'data-date="(.*?)"',
2705 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2708 for piece in data.keys():
2709 mobj = re.search(data[piece], video_webpage)
2710 if mobj is not None:
2711 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2715 for fmt in self._available_formats:
2716 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2717 if mobj is not None:
2718 # URL is in a Javascript segment inside an escaped Unicode format within
2719 # the generally utf-8 page
2720 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2721 video_info['video_urls'] = video_urls
2725 def _real_initialize(self):
2726 if self._downloader is None:
2731 downloader_params = self._downloader.params
2733 # Attempt to use provided username and password or .netrc data
2734 if downloader_params.get('username', None) is not None:
2735 useremail = downloader_params['username']
2736 password = downloader_params['password']
2737 elif downloader_params.get('usenetrc', False):
2739 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2740 if info is not None:
2744 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2745 except (IOError, netrc.NetrcParseError), err:
2746 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2749 if useremail is None:
2758 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2761 login_results = urllib2.urlopen(request).read()
2762 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2763 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2765 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2769 def _real_extract(self, url):
2770 mobj = re.match(self._VALID_URL, url)
2772 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2774 video_id = mobj.group('ID')
2777 self.report_video_webpage_download(video_id)
2778 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2780 page = urllib2.urlopen(request)
2781 video_webpage = page.read()
2782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2786 # Start extracting information
2787 self.report_information_extraction(video_id)
2789 # Extract information
2790 video_info = self._parse_page(video_webpage)
2793 if 'owner' not in video_info:
2794 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2796 video_uploader = video_info['owner']
2799 if 'title' not in video_info:
2800 self._downloader.trouble(u'ERROR: unable to extract video title')
2802 video_title = video_info['title']
2803 video_title = video_title.decode('utf-8')
2804 video_title = sanitize_title(video_title)
2807 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2808 simple_title = simple_title.strip(ur'_')
2811 if 'thumbnail' not in video_info:
2812 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2813 video_thumbnail = ''
2815 video_thumbnail = video_info['thumbnail']
2819 if 'upload_date' in video_info:
2820 upload_time = video_info['upload_date']
2821 timetuple = email.utils.parsedate_tz(upload_time)
2822 if timetuple is not None:
2824 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2829 video_description = video_info.get('description', 'No description available.')
2831 url_map = video_info['video_urls']
2832 if len(url_map.keys()) > 0:
2833 # Decide which formats to download
2834 req_format = self._downloader.params.get('format', None)
2835 format_limit = self._downloader.params.get('format_limit', None)
2837 if format_limit is not None and format_limit in self._available_formats:
2838 format_list = self._available_formats[self._available_formats.index(format_limit):]
2840 format_list = self._available_formats
2841 existing_formats = [x for x in format_list if x in url_map]
2842 if len(existing_formats) == 0:
2843 self._downloader.trouble(u'ERROR: no known formats available for video')
2845 if req_format is None:
2846 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2847 elif req_format == '-1':
2848 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2851 if req_format not in url_map:
2852 self._downloader.trouble(u'ERROR: requested format not available')
2854 video_url_list = [(req_format, url_map[req_format])] # Specific format
2856 for format_param, video_real_url in video_url_list:
2858 # At this point we have a new video
2859 self._downloader.increment_downloads()
2862 video_extension = self._video_extensions.get(format_param, 'mp4')
2865 # Process video information
2866 self._downloader.process_info({
2867 'id': video_id.decode('utf-8'),
2868 'url': video_real_url.decode('utf-8'),
2869 'uploader': video_uploader.decode('utf-8'),
2870 'upload_date': upload_date,
2871 'title': video_title,
2872 'stitle': simple_title,
2873 'ext': video_extension.decode('utf-8'),
2874 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2875 'thumbnail': video_thumbnail.decode('utf-8'),
2876 'description': video_description.decode('utf-8'),
2879 except UnavailableVideoError, err:
2880 self._downloader.trouble(u'\nERROR: unable to download video')
2882 class BlipTVIE(InfoExtractor):
2883 """Information extractor for blip.tv"""
2885 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2886 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2890 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2892 def report_extraction(self, file_id):
2893 """Report information extraction."""
2894 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2896 def _simplify_title(self, title):
2897 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2898 res = res.strip(ur'_')
2901 def _real_extract(self, url):
2902 mobj = re.match(self._VALID_URL, url)
2904 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2911 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2912 request = urllib2.Request(json_url)
2913 self.report_extraction(mobj.group(1))
2915 json_code = urllib2.urlopen(request).read()
2916 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2917 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2920 json_data = json.loads(json_code)
2921 if 'Post' in json_data:
2922 data = json_data['Post']
2926 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2927 video_url = data['media']['url']
2928 umobj = re.match(self._URL_EXT, video_url)
2930 raise ValueError('Can not determine filename extension')
2931 ext = umobj.group(1)
2933 self._downloader.increment_downloads()
2936 'id': data['item_id'],
2938 'uploader': data['display_name'],
2939 'upload_date': upload_date,
2940 'title': data['title'],
2941 'stitle': self._simplify_title(data['title']),
2943 'format': data['media']['mimeType'],
2944 'thumbnail': data['thumbnailUrl'],
2945 'description': data['description'],
2946 'player_url': data['embedUrl']
2948 except (ValueError,KeyError), err:
2949 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2953 self._downloader.process_info(info)
2954 except UnavailableVideoError, err:
2955 self._downloader.trouble(u'\nERROR: unable to download video')
2958 class MyVideoIE(InfoExtractor):
2959 """Information Extractor for myvideo.de."""
2961 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2963 def __init__(self, downloader=None):
2964 InfoExtractor.__init__(self, downloader)
2968 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2970 def report_download_webpage(self, video_id):
2971 """Report webpage download."""
2972 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2974 def report_extraction(self, video_id):
2975 """Report information extraction."""
2976 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2978 def _real_initialize(self):
2981 def _real_extract(self,url):
2982 mobj = re.match(self._VALID_URL, url)
2984 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2987 video_id = mobj.group(1)
2988 simple_title = mobj.group(2).decode('utf-8')
2989 # should actually not be necessary
2990 simple_title = sanitize_title(simple_title)
2991 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2994 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2996 self.report_download_webpage(video_id)
2997 webpage = urllib2.urlopen(request).read()
2998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3002 self.report_extraction(video_id)
3003 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3006 self._downloader.trouble(u'ERROR: unable to extract media URL')
3008 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3010 mobj = re.search('<title>([^<]+)</title>', webpage)
3012 self._downloader.trouble(u'ERROR: unable to extract title')
3015 video_title = mobj.group(1)
3016 video_title = sanitize_title(video_title)
3020 self._downloader.process_info({
3024 'upload_date': u'NA',
3025 'title': video_title,
3026 'stitle': simple_title,
3031 except UnavailableVideoError:
3032 self._downloader.trouble(u'\nERROR: Unable to download video')
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track of a downloaded video
    with ffmpeg/ffprobe, replacing the video file with the audio file."""

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        # Ask ffprobe for the stream list and pick the codec_name of the
        # first audio stream; None on any failure.
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        # Returns True iff ffmpeg exited with status 0.
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options (its to_screen output settings)
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest version...')

	# Download the new version; always release the connection, even when
	# the read fails (the visible original leaked the handle).
	try:
		urlh = urllib.urlopen(UPDATE_URL)
		try:
			newcontent = urlh.read()
		finally:
			urlh.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to download latest version')

	# Overwrite ourselves atomically enough for a single-file script;
	# close the handle in all cases so the data is flushed to disk.
	try:
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError):
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3197 def _format_option_string(option):
3198 ''' ('-o', '--option') -> -o, --format METAVAR'''
3202 if option._short_opts: opts.append(option._short_opts[0])
3203 if option._long_opts: opts.append(option._long_opts[0])
3204 if len(opts) > 1: opts.insert(1, ', ')
3206 if option.takes_value(): opts.append(' %s' % option.metavar)
3208 return "".join(opts)
3210 def _find_term_columns():
3211 columns = os.environ.get('COLUMNS', None)
3216 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3217 out,err = sp.communicate()
3218 return int(out.split()[1])
# --- parseOpts interior: help formatter, parser and option definitions ---
# NOTE(review): this excerpt elides several lines (the enclosing def, the
# max_width default, the 'kw = {' dict opener and try/except scaffolding);
# the statements below are reproduced as-is.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Hook the compact "-o, --option METAVAR" renderer into the formatter.
fmt.format_option_strings = _format_option_string

# Keyword arguments for OptionParser (the surrounding dict literal is
# partly elided in this excerpt).
	'version' : __version__,
	'usage' : '%prog [options] url...',
	'conflict_handler' : 'resolve',
parser = optparse.OptionParser(**kw)

# One OptionGroup per --help section.
general = optparse.OptionGroup(parser, 'General Options')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
		action='help', help='print this help text and exit')
general.add_option('-v', '--version',
		action='version', help='print program version and exit')
general.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
general.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
general.add_option('--dump-user-agent',
		action='store_true', dest='dump_user_agent',
		help='display the current browser identification', default=False)

authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')

verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail',
		help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription',
		help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
		action='store_true', dest='getfilename',
		help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
		action='store_true', dest='consoletitle',
		help='display progress in console titlebar', default=False)

filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber',
		help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
filesystem.add_option('--no-part',
		action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
		action='store_false', dest='updatetime',
		help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
		action='store_true', dest='writedescription',
		help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
		action='store_true', dest='writeinfojson',
		help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
		help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
		help='"best", "aac" or "mp3"; best by default')

parser.add_option_group(general)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

opts, args = parser.parse_args()

return parser, opts, args
# --- main() interior: configuration, IE registration and download loop ---
# NOTE(review): this excerpt elides several scaffolding lines (the
# enclosing def, some try:/else: lines and final sys.exit calls); the
# statements below are reproduced as-is.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
	# Persistent jar backed by opts.cookiefile; loaded only if readable.
	jar = cookielib.MozillaCookieJar(opts.cookiefile)
	if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent if requested (Python 2 print statement)
if opts.dump_user_agent:
	print std_headers['User-Agent']

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification
if opts.batchfile is not None:
	if opts.batchfile == '-':
		batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and comment lines starting with #, / or ;
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	# Prompt interactively rather than requiring the password on argv.
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist start number specified')
	opts.playlistend = int(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3']:
		parser.error(u'invalid audio format specified')

# Information extractors
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
vimeo_ie = VimeoIE()
myvideo_ie = MyVideoIE()
generic_ie = GenericIE()

# File downloader (NOTE(review): the closing '})' of this dict literal is
# elided in this excerpt)
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'format': opts.format,
	'format_limit': opts.format_limit,
	# Output template: first truthy clause of this or-chain wins, falling
	# through to the plain '%(id)s.%(ext)s' default.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)
fd.add_info_extractor(vimeo_ie)
fd.add_info_extractor(myvideo_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# Update version
if opts.update_self:
	updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to save cookie jar')
if __name__ == '__main__':
	# Entry point: translate the known exception types into exit codes /
	# messages.  NOTE(review): the 'try:' and main() call lines are elided
	# in this excerpt; only the exception handlers are visible.
	except DownloadError:
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3549 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: