2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
40 except ImportError: # Python 2.4
43 import cStringIO as StringIO
47 # parse_qs was moved from the cgi module to the urlparse module recently.
49 from urlparse import parse_qs
51 from cgi import parse_qs
55 except ImportError: # Python < 2.6
	# Default HTTP headers sent with every request; mimics a desktop Firefox
	# 4 beta. NOTE(review): the `std_headers = {` opening line and closing
	# brace are not visible in this chunk.
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
# Characters always kept in simplified titles (Python 2: str.decode('ascii')
# yields unicode objects).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	# Fallback pure-Python JSON parser used only when the stdlib `json`
	# module is unavailable. NOTE(review): many implementation lines are
	# elided from this chunk; only visible fragments are reproduced below.
	def raiseError(msg, i):
		# Fail with the position and the unparsed remainder of `s` for context.
		raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
	def skipSpace(i, expectMore=True):
		# Skip over JSON whitespace; optionally fail on premature end of input.
		while i < len(s) and s[i] in ' \t\r\n':
		raiseError('Premature end', i)
	def decodeEscape(match):
		# Decode one backslash escape from inside a JSON string literal.
		return unichr(int(esc[1:5], 16))
		if len(esc) == 5+6 and esc[5:7] == '\\u':
			# UTF-16 surrogate pair -> single code point.
			hi = int(esc[1:5], 16)
			low = int(esc[7:11], 16)
			return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
		raise ValueError('Unknown escape ' + str(esc))
	# parseString fragment: locate the closing quote, honouring backslashes.
	while s[e-bslashes-1] == '\\':
	if bslashes % 2 == 1:
	rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
	stri = rexp.sub(decodeEscape, s[i:e])
	# parseObj fragment: key/value pairs separated by colons and commas.
	if s[i] == '}': # Empty dictionary
	raiseError('Expected a string object key', i)
	i,key = parseString(i)
	if i >= len(s) or s[i] != ':':
		raiseError('Expected a colon', i)
	raiseError('Expected comma or closing curly brace', i)
	# parseArray fragment.
	if s[i] == ']': # Empty array
	i = skipSpace(i) # Raise exception if premature end
	raiseError('Expected a comma or closing bracket', i)
	def parseDiscrete(i):
		# Parse the bare literals true / false / null.
		for k,v in {'true': True, 'false': False, 'null': None}.items():
			if s.startswith(k, i):
		raiseError('Not a boolean (or null)', i)
	# parseNumber fragment (its def line is elided from this chunk).
	mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
	raiseError('Not a number', i)
	if '.' in nums or 'e' in nums or 'E' in nums:
		return (i+len(nums), float(nums))
	return (i+len(nums), int(nums))
	# Dispatch on the first character; anything else is tried as a number.
	CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
	i,res = CHARMAP.get(s[i], parseNumber)(i)
	i = skipSpace(i, False)
	raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		# NOTE(review): the surrounding try/except and the yield statements
		# of this generator are elided from this chunk.
		pref = locale.getpreferredencoding()
	# .next() pulls the first (and only needed) value out of the generator.
	return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)
	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])
	# Numeric character reference, decimal or (with an 'x' prefix) hex.
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	numstr = mobj.group(1)  # NOTE(review): the `if mobj is not None:` guard is elided from this chunk
	if numstr.startswith(u'x'):
		numstr = u'0%s' % numstr  # '0x...' form, parseable by long()
	return unichr(long(numstr, base))  # NOTE(review): the assignment of `base` is elided from this chunk
	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
223 def sanitize_title(utitle):
224 """Sanitizes a video title so it could be used as part of a filename."""
225 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
226 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	It returns the tuple (stream, definitive_file_name).
	"""
	# NOTE(review): the enclosing try: and guard lines before this branch
	# are elided from this chunk.
	if sys.platform == 'win32':
		# Put stdout into binary mode so video data is not mangled on Windows.
		msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
		return (sys.stdout, filename)
	stream = open(filename, open_mode)
	return (stream, filename)
except (IOError, OSError), err:
	# In case of error, try to remove win32 forbidden chars
	filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
	# An exception here should be caught in the caller
	stream = open(filename, open_mode)
	return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	# NOTE(review): the initialisation of `timestamp` and the final
	# `return timestamp` are elided from this chunk.
# Exception hierarchy. NOTE(review): docstring tails and the `pass` bodies
# of most of these classes are elided from this chunk.
class DownloadError(Exception):
	"""Download Error exception.
	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
class SameFileError(Exception):
	"""Same File exception.
	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
class PostProcessingError(Exception):
	"""Post Processing exception.
	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.
	This exception will be thrown when a video is requested
	in a format that is not available for that video.
class ContentTooShortError(Exception):
	"""Content Too Short exception.
	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	def __init__(self, downloaded, expected):
		self.downloaded = downloaded  # bytes actually received
		self.expected = expected  # bytes announced by the server
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.
	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.
	Part of this code was copied from:
	http://techknack.net/python-urllib2-handlers/
	Andrew Rowls, the author of that code, agreed to release it to the
	# deflate() fragment (its def/try lines are elided from this chunk):
	# raw deflate stream first, then the zlib-wrapped fallback.
	return zlib.decompress(data, -zlib.MAX_WBITS)
	return zlib.decompress(data)
	def addinfourl_wrapper(stream, headers, url, code):
		# Build an addinfourl; newer urllib2 versions accept `code` directly.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		# NOTE(review): assignment of ret.code and the return are elided.
	def http_request(self, req):
		# Copy std_headers onto the outgoing request (a guard line between
		# the for and the add_header call is elided from this chunk), then
		# honour the Youtubedl-no-compression pseudo-header.
		for h in std_headers:
			req.add_header(h, std_headers[h])
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
	def http_response(self, req, resp):
		# Transparently decompress gzip- and deflate-encoded bodies.
		# NOTE(review): the `old_resp = resp` assignment and the final
		# return are elided from this chunk.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
class FileDownloader(object):
	"""File Downloader class.
	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.
	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.
	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".
	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	forcethumbnail: Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename: Force printing final filename.
	simulate: Do not download the video files.
	format: Video format code.
	format_limit: Highest quality format to try.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	retries: Number of times to retry for HTTP error 5xx
	continuedl: Try to continue downloads if possible.
	noprogress: Do not print the progress bar.
	playliststart: Playlist item to start at.
	playlistend: Playlist item to end at.
	logtostderr: Log messages to stderr instead of stdout.
	consoletitle: Display progress in console window's titlebar.
	nopart: Do not use temporary .part files.
	updatetime: Use the Last-modified header to set output file timestamps.
	writedescription: Write the video description to a .description file
	writeinfojson: Write the video description to a .info.json file
	_download_retcode = None  # overall return code; set to 0 in __init__
	_num_downloads = None  # ordinal behind the %(autonumber)s template
def __init__(self, params):
	"""Create a FileDownloader object with the given options."""
	# NOTE(review): initialisation of the InfoExtractor/postprocessor lists
	# and the storage of `params` are elided from this chunk.
	self._download_retcode = 0  # becomes 1 once any download fails
	self._num_downloads = 0  # files downloaded so far (for %(autonumber)s)
	self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
def pmkdir(filename):
	"""Create directory components in filename. Similar to Unix "mkdir -p"."""
	components = filename.split(os.sep)
	# Build each intermediate path prefix: a/, a/b/, a/b/c/, ...
	aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
	aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
	for dir in aggregate:
		if not os.path.exists(dir):
			# NOTE(review): the directory-creation call is elided from this chunk.
def format_bytes(bytes):
	# Human-readable byte count, e.g. '1.50M'. NOTE(review): docstring and
	# the special cases for str/small inputs are elided from this chunk.
	if type(bytes) is str:
	exponent = long(math.log(bytes, 1024.0))
	suffix = 'bkMGTPEZY'[exponent]
	converted = float(bytes) / float(1024**exponent)
	return '%.2f%s' % (converted, suffix)
def calc_percent(byte_counter, data_len):
	# Progress percentage padded to 6 chars; the guard for an unknown
	# data_len is elided from this chunk.
	return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
def calc_eta(start, now, total, current):
	# Estimated remaining time as MM:SS. NOTE(review): the computation of
	# `dif` and the early-return lines are elided from this chunk.
	if current == 0 or dif < 0.001: # One millisecond
	rate = float(current) / dif
	eta = long((float(total) - float(current)) / rate)
	(eta_mins, eta_secs) = divmod(eta, 60)
	return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
	# Download speed string padded to 10 chars. NOTE(review): the
	# computation of `dif` is elided from this chunk.
	if bytes == 0 or dif < 0.001: # One millisecond
		return '%10s' % '---b/s'
	return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
def best_block_size(elapsed_time, bytes):
	# Adapt the read block size to observed throughput, clamped to 4 MB.
	new_min = max(bytes / 2.0, 1.0)
	new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
	if elapsed_time < 0.001:
	rate = bytes / elapsed_time
	# NOTE(review): the clamping/return lines are elided from this chunk.
def parse_bytes(bytestr):
	"""Parse a string indicating a byte quantity into a long integer."""
	matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
	# NOTE(review): the None-check for matchobj is elided from this chunk.
	number = float(matchobj.group(1))
	multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
	return long(round(number * multiplier))
def add_info_extractor(self, ie):
	"""Add an InfoExtractor object to the end of the list."""
	# NOTE(review): the list append is elided from this chunk; mutual
	# registration gives the IE a back-reference to this downloader.
	ie.set_downloader(self)
def add_post_processor(self, pp):
	"""Add a PostProcessor object to the end of the chain."""
	# NOTE(review): the list append is elided from this chunk.
	pp.set_downloader(self)
def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
	"""Print message to stdout if not in quiet mode."""
	# NOTE(review): the enclosing try: line is elided from this chunk.
	if not self.params.get('quiet', False):
		terminator = [u'\n', u''][skip_eol]
		# Encode for the locale; the trailing comma suppresses print's newline.
		print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
		self._screen_file.flush()
	except (UnicodeEncodeError), err:
		# Swallow encoding errors only when the caller opted in; the
		# re-raise line is elided from this chunk.
		if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write a message, encoded for the current locale, to stderr."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
def to_cons_title(self, message):
	"""Set console/terminal window title to message."""
	if not self.params.get('consoletitle', False):
		# NOTE(review): the early return is elided from this chunk.
	if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
		# c_wchar_p() might not be necessary if `message` is
		# already of type unicode()
		ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
	elif 'TERM' in os.environ:
		# xterm-style OSC escape sequence to set the terminal title.
		sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
550 def fixed_template(self):
551 """Checks if the output template is fixed."""
552 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
	"""Determine action to take when a download problem appears.

	Depending on if the downloader has been configured to ignore
	download errors or not, this method may throw an exception or
	not when errors are found, after printing the message.
	"""
	if message is not None:
		self.to_stderr(message)
	if not self.params.get('ignoreerrors', False):
		raise DownloadError(message)
	self._download_retcode = 1  # remember the failure for the exit status
def slow_down(self, start_time, byte_counter):
	"""Sleep if the download speed is over the rate limit."""
	rate_limit = self.params.get('ratelimit', None)
	if rate_limit is None or byte_counter == 0:
		# NOTE(review): the early return and the assignment of `now` are
		# elided from this chunk.
	elapsed = now - start_time
	speed = float(byte_counter) / elapsed
	if speed > rate_limit:
		# Sleep just long enough to fall back under the configured limit.
		time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def temp_name(self, filename):
	"""Returns a temporary filename for the given filename."""
	# Skip .part files for stdout, for --no-part, and for non-regular files.
	if self.params.get('nopart', False) or filename == u'-' or \
			(os.path.exists(filename) and not os.path.isfile(filename)):
		# NOTE(review): the return of the unchanged filename is elided.
	return filename + u'.part'
def undo_temp_name(self, filename):
	# Strip a trailing .part suffix, if present; the fall-through return
	# is elided from this chunk.
	if filename.endswith(u'.part'):
		return filename[:-len(u'.part')]
def try_rename(self, old_filename, new_filename):
	# Rename the temporary .part file to its final name, reporting failure
	# through trouble(). NOTE(review): the early return and the enclosing
	# try: line are elided from this chunk.
	if old_filename == new_filename:
	os.rename(old_filename, new_filename)
	except (IOError, OSError), err:
		self.trouble(u'ERROR: unable to rename file')
def try_utime(self, filename, last_modified_hdr):
	"""Try to set the last-modified time of the given file."""
	if last_modified_hdr is None:
		# NOTE(review): early return elided from this chunk.
	if not os.path.isfile(filename):
		# NOTE(review): early return elided from this chunk.
	timestr = last_modified_hdr
	filetime = timeconvert(timestr)
	# Keep atime current; set mtime from the server header.
	# NOTE(review): the None-check for `filetime` is elided from this chunk.
	os.utime(filename,(time.time(), filetime))
def report_writedescription(self, descfn):
	"""Announce that the video description file is being written."""
	message = u'[info] Writing video description to: %s' % descfn
	self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	"""Announce that the .info.json metadata file is being written."""
	message = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Announce the destination filename of the current download."""
	message = u'[download] Destination: %s' % filename
	self.to_screen(message, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
	"""Report download progress."""
	if self.params.get('noprogress', False):
		# NOTE(review): early return elided from this chunk.
	# \r rewrites the progress line in place; skip_eol avoids a newline.
	self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
			(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
	self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
			(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Announce that the download resumes at the given byte offset."""
	message = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(message)
def report_retry(self, count, retries):
	"""Announce a retry after a server-side (5xx) HTTP error."""
	message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(message)
def report_file_already_downloaded(self, file_name):
	"""Report file has already been fully downloaded."""
	# NOTE(review): the enclosing try: line is elided from this chunk.
	self.to_screen(u'[download] %s has already been downloaded' % file_name)
	except (UnicodeEncodeError), err:
		# Fall back to a message without the (unencodable) filename.
		self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming the partial download was impossible."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
def report_finish(self):
	"""Report download finished."""
	if self.params.get('noprogress', False):
		self.to_screen(u'[download] Download completed')
	# NOTE(review): the remaining branch of this method is elided from
	# this chunk.
def increment_downloads(self):
	"""Advance the ordinal that numbers each downloaded file."""
	self._num_downloads = self._num_downloads + 1
def prepare_filename(self, info_dict):
	"""Generate the output filename."""
	# NOTE(review): the enclosing try: line is elided from this chunk.
	template_dict = dict(info_dict)  # copy so the caller's dict is untouched
	template_dict['epoch'] = unicode(long(time.time()))
	template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
	filename = self.params['outtmpl'] % template_dict
	# NOTE(review): the return of `filename` is elided from this chunk.
	except (ValueError, KeyError), err:
		self.trouble(u'ERROR: invalid system charset or erroneous output template')
		# NOTE(review): the fallback return is elided from this chunk.
def process_info(self, info_dict):
	"""Process a single dictionary returned by an InfoExtractor."""
	# NOTE(review): several guard/return and try: lines of this method are
	# elided from this chunk; hedged comments mark the visible gaps.
	filename = self.prepare_filename(info_dict)
	# Do nothing else if in simulate mode
	if self.params.get('simulate', False):
		# Forced printing of selected fields (lets youtube-dl serve as a
		# backend for other tools).
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		# (return elided from this chunk)
	if self.params.get('nooverwrites', False) and os.path.exists(filename):
		self.to_stderr(u'WARNING: file exists and will be skipped')
		# (return elided from this chunk)
	self.pmkdir(filename)  # (enclosing try: elided from this chunk)
	except (OSError, IOError), err:
		self.trouble(u'ERROR: unable to create directories: %s' % str(err))
	if self.params.get('writedescription', False):
		# Write a sidecar .description file (try: line elided).
		descfn = filename + '.description'
		self.report_writedescription(descfn)
		with contextlib.closing(open(descfn, 'wb')) as descfile:
			descfile.write(info_dict['description'].encode('utf-8'))
		except (OSError, IOError):
			self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
	if self.params.get('writeinfojson', False):
		infofn = filename + '.info.json'
		self.report_writeinfojson(infofn)
		# (probe for an available `json` module partially elided)
		except (NameError,AttributeError):
			self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
		with contextlib.closing(open(infofn, 'wb')) as infof:
			json.dump(info_dict, infof)
		except (OSError, IOError):
			self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
	# Actual download; the simulate/skip guard and try: lines are elided.
	success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
	except (OSError, IOError), err:
		raise UnavailableVideoError
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self.trouble(u'ERROR: unable to download video data: %s' % str(err))
	except (ContentTooShortError, ), err:
		self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
	# Run postprocessors on the finished file (surrounding lines elided).
	self.post_process(filename, info_dict)
	except (PostProcessingError), err:
		self.trouble(u'ERROR: postprocessing: %s' % str(err))
def download(self, url_list):
	"""Download a given list of URLs."""
	if len(url_list) > 1 and self.fixed_template():
		# A fixed template would write every URL to the same file.
		raise SameFileError(self.params['outtmpl'])
	# NOTE(review): the loop over url_list and the inner loop over the
	# registered InfoExtractors are partially elided from this chunk.
	suitable_found = False
	# Go to next InfoExtractor if not suitable
	if not ie.suitable(url):
		# (loop-control line elided from this chunk)
	# Suitable InfoExtractor found
	suitable_found = True
	# Extract information from URL and process it
	# (extraction call elided from this chunk)
	# Suitable InfoExtractor had been found; go to next URL
	if not suitable_found:
		self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
	return self._download_retcode
def post_process(self, filename, ie_info):
	"""Run the postprocessing chain on the given file."""
	# NOTE(review): the construction of `info` (presumably a copy of
	# ie_info) and the loop over the postprocessors are elided from this
	# chunk.
	info['filepath'] = filename
def _download_with_rtmpdump(self, filename, url, player_url):
	# Download an rtmp:// URL by shelling out to the external rtmpdump tool.
	self.report_destination(filename)
	tmpfilename = self.temp_name(filename)
	# Check for rtmpdump first
	# (enclosing try: line elided from this chunk)
	subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
	except (OSError, IOError):
		self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
		# (return elided from this chunk)
	# Download using rtmpdump. rtmpdump returns exit code 2 when
	# the connection was interrupted and resuming appears to be
	# possible. This is part of rtmpdump's normal usage, AFAIK.
	basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
	retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
	while retval == 2 or retval == 1:
		prevsize = os.path.getsize(tmpfilename)
		self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
		time.sleep(5.0) # This seems to be needed
		retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		cursize = os.path.getsize(tmpfilename)
		if prevsize == cursize and retval == 1:
			# (loop-exit line elided — no progress was made)
	# Success path: report final size and move the .part file into place.
	# (the guard on retval between these lines is elided from this chunk)
	self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
	self.try_rename(tmpfilename, filename)
	self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
	# (return lines elided from this chunk)
def _do_download(self, filename, url, player_url):
	# Core HTTP download loop: resume support, 5xx retries, adaptive block
	# size, rate limiting and progress reporting. NOTE(review): numerous
	# lines (try:/raise/return/loop headers) are elided from this chunk;
	# hedged comments mark the visible gaps.
	# Check file already present
	if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
		self.report_file_already_downloaded(filename)
		# (return elided from this chunk)
	# Attempt to download using rtmpdump
	if url.startswith('rtmp'):
		return self._download_with_rtmpdump(filename, url, player_url)
	tmpfilename = self.temp_name(filename)
	# Do not include the Accept-Encoding header
	headers = {'Youtubedl-no-compression': 'True'}
	basic_request = urllib2.Request(url, None, headers)
	request = urllib2.Request(url, None, headers)
	# Establish possible resume length
	if os.path.isfile(tmpfilename):
		resume_len = os.path.getsize(tmpfilename)
	# (else-branch setting resume_len to 0 elided from this chunk)
	# Request parameters in case of being able to resume
	if self.params.get('continuedl', False) and resume_len != 0:
		self.report_resuming_byte(resume_len)
		request.add_header('Range','bytes=%d-' % resume_len)
	retries = self.params.get('retries', 0)
	while count <= retries:
		# Establish connection
		# (try: line elided from this chunk)
		data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if (err.code < 500 or err.code >= 600) and err.code != 416:
				# Unexpected HTTP error
				# (re-raise elided from this chunk)
			elif err.code == 416:
				# Unable to resume (requested range not satisfiable)
				# Open the connection again without the range header
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']
				except (urllib2.HTTPError, ), err:
					if err.code < 500 or err.code >= 600:
						# (re-raise elided from this chunk)
				# Examine the reported length
				if (content_length is not None and
						(resume_len - 100 < long(content_length) < resume_len + 100)):
					# The file had already been fully downloaded.
					# Explanation to the above condition: in issue #175 it was revealed that
					# YouTube sometimes adds or removes a few bytes from the end of the file,
					# changing the file size slightly and causing problems for some users. So
					# I decided to implement a suggested change and consider the file
					# completely downloaded if the file size differs less than 100 bytes from
					# the one in the hard drive.
					self.report_file_already_downloaded(filename)
					self.try_rename(tmpfilename, filename)
					# (return elided from this chunk)
				# The length does not match, we start the download over
				self.report_unable_to_resume()
		self.report_retry(count, retries)
	self.trouble(u'ERROR: giving up after %s retries' % retries)
	data_len = data.info().get('Content-length', None)
	if data_len is not None:
		# Content-Length covers only the remaining range; add what we have.
		data_len = long(data_len) + resume_len
	data_len_str = self.format_bytes(data_len)
	byte_counter = 0 + resume_len
	# (block-size/start-time initialisation and the read-loop header elided)
	data_block = data.read(block_size)
	if len(data_block) == 0:
		# (loop-exit line elided — end of stream)
	byte_counter += len(data_block)
	# Open file just in time
	# (guard and try: lines elided from this chunk)
	(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
	filename = self.undo_temp_name(tmpfilename)
	self.report_destination(filename)
	except (OSError, IOError), err:
		self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
	stream.write(data_block)
	except (IOError, OSError), err:
		self.trouble(u'\nERROR: unable to write data: %s' % str(err))
	block_size = self.best_block_size(after - before, len(data_block))
	# Progress reporting relative to the resumed offset.
	percent_str = self.calc_percent(byte_counter, data_len)
	eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
	speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
	self.report_progress(percent_str, data_len_str, speed_str, eta_str)
	# Apply rate limit
	self.slow_down(start, byte_counter - resume_len)
	if data_len is not None and byte_counter != data_len:
		raise ContentTooShortError(byte_counter, long(data_len))
	self.try_rename(tmpfilename, filename)
	# Update file modification time
	if self.params.get('updatetime', True):
		self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
	"""Information Extractor class.
	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:
	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.
	format: Video format.
	player_url: SWF Player URL (may be None).
	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:
	thumbnail: Full URL to a video thumbnail image.
	description: One-line video description.
	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# NOTE(review): one initialisation line is elided from this chunk.
		self.set_downloader(downloader)
	# suitable() fragment — its def line is elided from this chunk:
		"""Receives a URL and returns True if suitable for this IE."""
	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# NOTE(review): a guard line before this call is elided.
		self._real_initialize()
	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		# NOTE(review): a line before this call is elided from this chunk.
		return self._real_extract(url)
	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader
	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""
	# Watch pages, embeds, youtu.be links, etc.; group 2 captures the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the English (US) interface so pages are parseable.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'  # machine key for ~/.netrc credential lookup
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	_video_extensions = {
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
	# NOTE(review): the remaining extension entries, the closing brace and
	# the suitable() def line are elided; only its return is visible:
	return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce the attempt to set the interface language."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
	"""Announce the attempt to log in."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
	"""Announce the attempt to confirm age on the age-gate form."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
	"""Announce that the watch page for video_id is being fetched."""
	message = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
	"""Announce that the video-info page for video_id is being fetched."""
	message = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
	"""Announce that the requested format is not offered for this video."""
	message = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(message)
def report_rtmp_download(self):
	"""Announce that the video will be fetched over RTMP."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] RTMP download detected')
def _real_initialize(self):
	# Set the interface language, then optionally log in and confirm age.
	# NOTE(review): many guard/try:/assignment lines are elided from this
	# chunk; hedged comments mark the visible gaps.
	if self._downloader is None:
		# (early return elided from this chunk)
	downloader_params = self._downloader.params
	# Attempt to use provided username and password or .netrc data
	if downloader_params.get('username', None) is not None:
		username = downloader_params['username']
		password = downloader_params['password']
	elif downloader_params.get('usenetrc', False):
		# (enclosing try: elided from this chunk)
		info = netrc.netrc().authenticators(self._NETRC_MACHINE)
		if info is not None:
			# (unpacking of login/password from `info` elided)
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
		except (IOError, netrc.NetrcParseError), err:
			# Missing or malformed .netrc is only a warning, not fatal.
			self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
	# Force the English interface (best effort).
	request = urllib2.Request(self._LANG_URL)
	urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
	# No authentication to be performed
	if username is None:
		# (early return elided from this chunk)
	# Log in (the dict header for login_form is elided from this chunk).
		'current_form': 'loginForm',
		'action_login': 'Log In',
		'username': username,
		'password': password,
	request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
	login_results = urllib2.urlopen(request).read()
	# The login form re-appearing in the response means the login failed.
	if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
		self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
	# Confirm age (the dict header for age_form is elided from this chunk).
		'action_confirm': 'Confirm',
	request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
	self.report_age_confirmation()
	age_results = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1150 def _real_extract(self, url):
1151 # Extract video id from URL
1152 mobj = re.match(self._VALID_URL, url)
1154 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1156 video_id = mobj.group(2)
1159 self.report_video_webpage_download(video_id)
1160 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1162 video_webpage = urllib2.urlopen(request).read()
1163 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1164 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1167 # Attempt to extract SWF player URL
1168 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1169 if mobj is not None:
1170 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1175 self.report_video_info_webpage_download(video_id)
1176 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1177 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1178 % (video_id, el_type))
1179 request = urllib2.Request(video_info_url)
1181 video_info_webpage = urllib2.urlopen(request).read()
1182 video_info = parse_qs(video_info_webpage)
1183 if 'token' in video_info:
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1188 if 'token' not in video_info:
1189 if 'reason' in video_info:
1190 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1192 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1195 # Start extracting information
1196 self.report_information_extraction(video_id)
1199 if 'author' not in video_info:
1200 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1202 video_uploader = urllib.unquote_plus(video_info['author'][0])
1205 if 'title' not in video_info:
1206 self._downloader.trouble(u'ERROR: unable to extract video title')
1208 video_title = urllib.unquote_plus(video_info['title'][0])
1209 video_title = video_title.decode('utf-8')
1210 video_title = sanitize_title(video_title)
1213 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1214 simple_title = simple_title.strip(ur'_')
1217 if 'thumbnail_url' not in video_info:
1218 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1219 video_thumbnail = ''
1220 else: # don't panic if we can't find it
1221 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1225 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1226 if mobj is not None:
1227 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1228 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1229 for expression in format_expressions:
1231 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1239 video_description = u'No description available.'
1240 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1241 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1242 if mobj is not None:
1243 video_description = mobj.group(1).decode('utf-8')
1245 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1246 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1247 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1248 # TODO use another parser
1251 video_token = urllib.unquote_plus(video_info['token'][0])
1253 # Decide which formats to download
1254 req_format = self._downloader.params.get('format', None)
1256 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1257 self.report_rtmp_download()
1258 video_url_list = [(None, video_info['conn'][0])]
1259 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1260 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1261 url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1262 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1263 url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1265 format_limit = self._downloader.params.get('format_limit', None)
1266 if format_limit is not None and format_limit in self._available_formats:
1267 format_list = self._available_formats[self._available_formats.index(format_limit):]
1269 format_list = self._available_formats
1270 existing_formats = [x for x in format_list if x in url_map]
1271 if len(existing_formats) == 0:
1272 self._downloader.trouble(u'ERROR: no known formats available for video')
1274 if req_format is None:
1275 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1276 elif req_format == '-1':
1277 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1280 if req_format not in url_map:
1281 self._downloader.trouble(u'ERROR: requested format not available')
1283 video_url_list = [(req_format, url_map[req_format])] # Specific format
1285 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1288 for format_param, video_real_url in video_url_list:
1289 # At this point we have a new video
1290 self._downloader.increment_downloads()
1293 video_extension = self._video_extensions.get(format_param, 'flv')
1295 # Find the video URL in fmt_url_map or conn paramters
1297 # Process video information
1298 self._downloader.process_info({
1299 'id': video_id.decode('utf-8'),
1300 'url': video_real_url.decode('utf-8'),
1301 'uploader': video_uploader.decode('utf-8'),
1302 'upload_date': upload_date,
1303 'title': video_title,
1304 'stitle': simple_title,
1305 'ext': video_extension.decode('utf-8'),
1306 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1307 'thumbnail': video_thumbnail.decode('utf-8'),
1308 'description': video_description,
1309 'player_url': player_url,
1311 except UnavailableVideoError, err:
1312 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# MetacafeIE class header and endpoints: the URL regex captures (video id,
# simple title); the two URLs drive the family-filter opt-out handshake.
1315 class MetacafeIE(InfoExtractor):
1316 """Information Extractor for metacafe.com."""
1318 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1319 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1320 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Initialize the extractor.

    A YoutubeIE instance is kept so Metacafe pages that merely wrap
    YouTube videos can be delegated to it.
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
# Tail of suitable() (def line elided): True when the URL matches the regex.
1329 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce the fetch of the Metacafe family-filter disclaimer page."""
    screen = self._downloader.to_screen
    screen(u'[metacafe] Retrieving disclaimer')
def report_age_confirmation(self):
    """Announce that the family-filter age form is being confirmed."""
    self._downloader.to_screen(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical.
# _real_initialize: fetch the disclaimer page, then POST the "over 18"
# filter form so subsequent watch pages are not family-filtered.
1347 def _real_initialize(self):
1348 # Retrieve disclaimer
1349 request = urllib2.Request(self._DISCLAIMER)
1351 self.report_disclaimer()
1352 disclaimer = urllib2.urlopen(request).read()
1353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1354 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1360 'submit': "Continue - I'm over 18",
1362 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1364 self.report_age_confirmation()
1365 disclaimer = urllib2.urlopen(request).read()
1366 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1367 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# _real_extract: delegate yt-* ids to YoutubeIE, otherwise scrape the
# watch page for mediaURL/gdaKey (or flashvars mediaData), title, uploader.
1370 def _real_extract(self, url):
1371 # Extract id and simplified title from URL
1372 mobj = re.match(self._VALID_URL, url)
1374 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1377 video_id = mobj.group(1)
1379 # Check if video comes from YouTube
1380 mobj2 = re.match(r'^yt-(.*)$', video_id)
1381 if mobj2 is not None:
1382 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1385 # At this point we have a new video
1386 self._downloader.increment_downloads()
1388 simple_title = mobj.group(2).decode('utf-8')
1390 # Retrieve video webpage to extract further information
1391 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1393 self.report_download_webpage(video_id)
1394 webpage = urllib2.urlopen(request).read()
1395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1396 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1399 # Extract URL, uploader and title from webpage
1400 self.report_extraction(video_id)
1401 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1402 if mobj is not None:
1403 mediaURL = urllib.unquote(mobj.group(1))
1404 video_extension = mediaURL[-3:]
1406 # Extract gdaKey if available
1407 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1409 video_url = mediaURL
1411 gdaKey = mobj.group(1)
1412 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: dig the media URL out of the flashvars mediaData JSON.
1414 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1416 self._downloader.trouble(u'ERROR: unable to extract media URL')
1418 vardict = parse_qs(mobj.group(1))
1419 if 'mediaData' not in vardict:
1420 self._downloader.trouble(u'ERROR: unable to extract media URL')
1422 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1424 self._downloader.trouble(u'ERROR: unable to extract media URL')
1426 mediaURL = mobj.group(1).replace('\\/', '/')
1427 video_extension = mediaURL[-3:]
1428 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1430 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1432 self._downloader.trouble(u'ERROR: unable to extract title')
1434 video_title = mobj.group(1).decode('utf-8')
1435 video_title = sanitize_title(video_title)
1437 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1439 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1441 video_uploader = mobj.group(1)
1444 # Process video information
1445 self._downloader.process_info({
1446 'id': video_id.decode('utf-8'),
1447 'url': video_url.decode('utf-8'),
1448 'uploader': video_uploader.decode('utf-8'),
1449 'upload_date': u'NA',
1450 'title': video_title,
1451 'stitle': simple_title,
1452 'ext': video_extension.decode('utf-8'),
1456 except UnavailableVideoError:
1457 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# DailymotionIE header: the URL regex captures (video id, simple title).
1460 class DailymotionIE(InfoExtractor):
1461 """Information Extractor for Dailymotion"""
1463 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
# Plain initializer: all shared state lives in the InfoExtractor base.
1465 def __init__(self, downloader=None):
1466 InfoExtractor.__init__(self, downloader)
# Tail of suitable() (def line elided): True on a URL-regex match.
1470 return (re.match(DailymotionIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical. _real_initialize's body is elided.
1480 def _real_initialize(self):
# _real_extract: scrape addVariable("video", ...) for the media URL, then
# the <title> and <Attribute name="owner"> tags for title and uploader.
1483 def _real_extract(self, url):
1484 # Extract id and simplified title from URL
1485 mobj = re.match(self._VALID_URL, url)
1487 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1490 # At this point we have a new video
1491 self._downloader.increment_downloads()
1492 video_id = mobj.group(1)
1494 simple_title = mobj.group(2).decode('utf-8')
1495 video_extension = 'flv'
1497 # Retrieve video webpage to extract further information
1498 request = urllib2.Request(url)
1500 self.report_download_webpage(video_id)
1501 webpage = urllib2.urlopen(request).read()
1502 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1506 # Extract URL, uploader and title from webpage
1507 self.report_extraction(video_id)
1508 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1510 self._downloader.trouble(u'ERROR: unable to extract media URL')
1512 mediaURL = urllib.unquote(mobj.group(1))
1514 # if needed add http://www.dailymotion.com/ if relative URL
1516 video_url = mediaURL
1518 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1519 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1521 self._downloader.trouble(u'ERROR: unable to extract title')
1523 video_title = mobj.group(1).decode('utf-8')
1524 video_title = sanitize_title(video_title)
1526 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1528 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1530 video_uploader = mobj.group(1)
1533 # Process video information
1534 self._downloader.process_info({
1535 'id': video_id.decode('utf-8'),
1536 'url': video_url.decode('utf-8'),
1537 'uploader': video_uploader.decode('utf-8'),
1538 'upload_date': u'NA',
1539 'title': video_title,
1540 'stitle': simple_title,
1541 'ext': video_extension.decode('utf-8'),
1545 except UnavailableVideoError:
1546 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# GoogleIE header: the URL regex captures the docid across the many
# country-specific video.google.* domains.
1548 class GoogleIE(InfoExtractor):
1549 """Information extractor for video.google.com."""
1551 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
# Plain initializer: all shared state lives in the InfoExtractor base.
1553 def __init__(self, downloader=None):
1554 InfoExtractor.__init__(self, downloader)
# Tail of suitable() (def line elided): True on a URL-regex match.
1558 return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the videoplay page for *video_id*."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical. _real_initialize's body is elided.
1568 def _real_initialize(self):
# _real_extract: prefer the mp4 download_url; fall back to the flv
# videoUrl embedded with \x-escapes. Thumbnail requires a second fetch.
1571 def _real_extract(self, url):
1572 # Extract id from URL
1573 mobj = re.match(self._VALID_URL, url)
1575 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1578 # At this point we have a new video
1579 self._downloader.increment_downloads()
1580 video_id = mobj.group(1)
1582 video_extension = 'mp4'
1584 # Retrieve video webpage to extract further information
1585 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1587 self.report_download_webpage(video_id)
1588 webpage = urllib2.urlopen(request).read()
1589 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1590 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1593 # Extract URL, uploader, and title from webpage
1594 self.report_extraction(video_id)
1595 mobj = re.search(r"download_url:'([^']+)'", webpage)
1597 video_extension = 'flv'
1598 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1600 self._downloader.trouble(u'ERROR: unable to extract media URL')
1602 mediaURL = urllib.unquote(mobj.group(1))
# Replace the literal backslash-escapes with the real '=' and '&' bytes.
1603 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1604 mediaURL = mediaURL.replace('\\x26', '\x26')
1606 video_url = mediaURL
1608 mobj = re.search(r'<title>(.*)</title>', webpage)
1610 self._downloader.trouble(u'ERROR: unable to extract title')
1612 video_title = mobj.group(1).decode('utf-8')
1613 video_title = sanitize_title(video_title)
1614 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1616 # Extract video description
1617 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1619 self._downloader.trouble(u'ERROR: unable to extract video description')
1621 video_description = mobj.group(1).decode('utf-8')
1622 if not video_description:
1623 video_description = 'No description available.'
1625 # Extract video thumbnail
1626 if self._downloader.params.get('forcethumbnail', False):
1627 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1629 webpage = urllib2.urlopen(request).read()
1630 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1631 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1633 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1635 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1637 video_thumbnail = mobj.group(1)
1638 else: # we need something to pass to process_info
1639 video_thumbnail = ''
1643 # Process video information
1644 self._downloader.process_info({
1645 'id': video_id.decode('utf-8'),
1646 'url': video_url.decode('utf-8'),
1648 'upload_date': u'NA',
1649 'title': video_title,
1650 'stitle': simple_title,
1651 'ext': video_extension.decode('utf-8'),
1655 except UnavailableVideoError:
1656 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# PhotobucketIE header: the URL regex captures the .flv name from the
# 'current=' query parameter.
1659 class PhotobucketIE(InfoExtractor):
1660 """Information extractor for photobucket.com."""
1662 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
# Plain initializer: all shared state lives in the InfoExtractor base.
1664 def __init__(self, downloader=None):
1665 InfoExtractor.__init__(self, downloader)
# Tail of suitable() (def line elided): True on a URL-regex match.
1669 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the page hosting *video_id*."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical. _real_initialize's body is elided.
1679 def _real_initialize(self):
# _real_extract: pull the media URL from the video_src <link> tag and both
# title and uploader from one <title> regex.
1682 def _real_extract(self, url):
1683 # Extract id from URL
1684 mobj = re.match(self._VALID_URL, url)
1686 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1689 # At this point we have a new video
1690 self._downloader.increment_downloads()
1691 video_id = mobj.group(1)
1693 video_extension = 'flv'
1695 # Retrieve video webpage to extract further information
1696 request = urllib2.Request(url)
1698 self.report_download_webpage(video_id)
1699 webpage = urllib2.urlopen(request).read()
1700 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1701 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1704 # Extract URL, uploader, and title from webpage
1705 self.report_extraction(video_id)
1706 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1708 self._downloader.trouble(u'ERROR: unable to extract media URL')
1710 mediaURL = urllib.unquote(mobj.group(1))
1712 video_url = mediaURL
1714 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1716 self._downloader.trouble(u'ERROR: unable to extract title')
1718 video_title = mobj.group(1).decode('utf-8')
1719 video_title = sanitize_title(video_title)
1720 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Uploader comes from the second group of the same <title> match.
1722 video_uploader = mobj.group(2).decode('utf-8')
1725 # Process video information
1726 self._downloader.process_info({
1727 'id': video_id.decode('utf-8'),
1728 'url': video_url.decode('utf-8'),
1729 'uploader': video_uploader,
1730 'upload_date': u'NA',
1731 'title': video_title,
1732 'stitle': simple_title,
1733 'ext': video_extension.decode('utf-8'),
1737 except UnavailableVideoError:
1738 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# YahooIE header: _VALID_URL accepts all Yahoo! Video URLs, _VPAGE_URL the
# subset of /watch/ pages the extractor can actually parse.
1741 class YahooIE(InfoExtractor):
1742 """Information extractor for video.yahoo.com."""
1744 # _VALID_URL matches all Yahoo! Video URLs
1745 # _VPAGE_URL matches only the extractable '/watch/' URLs
1746 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1747 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
# Plain initializer: all shared state lives in the InfoExtractor base.
1749 def __init__(self, downloader=None):
1750 InfoExtractor.__init__(self, downloader)
# Tail of suitable() (def line elided): True on a _VALID_URL match.
1754 return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical. _real_initialize's body is elided.
1764 def _real_initialize(self):
# _real_extract: non-/watch/ URLs are rewritten to canonical /watch/ form
# and re-entered with new_video=False; then the page is scraped and the
# media URL fetched from the bcst.yahoo.com playlist service.
1767 def _real_extract(self, url, new_video=True):
1768 # Extract ID from URL
1769 mobj = re.match(self._VALID_URL, url)
1771 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1774 # At this point we have a new video
1775 self._downloader.increment_downloads()
1776 video_id = mobj.group(2)
1777 video_extension = 'flv'
1779 # Rewrite valid but non-extractable URLs as
1780 # extractable English language /watch/ URLs
1781 if re.match(self._VPAGE_URL, url) is None:
1782 request = urllib2.Request(url)
1784 webpage = urllib2.urlopen(request).read()
1785 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1786 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1789 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1791 self._downloader.trouble(u'ERROR: Unable to extract id field')
1793 yahoo_id = mobj.group(1)
1795 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1797 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1799 yahoo_vid = mobj.group(1)
1801 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1802 return self._real_extract(url, new_video=False)
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request(url)
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813 # Extract uploader and title from webpage
1814 self.report_extraction(video_id)
1815 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1817 self._downloader.trouble(u'ERROR: unable to extract video title')
1819 video_title = mobj.group(1).decode('utf-8')
1820 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1822 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1824 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the '(people|profile)' alternative, not the
# name in group(2) -- looks like an off-by-one group pick; verify upstream.
1826 video_uploader = mobj.group(1).decode('utf-8')
1828 # Extract video thumbnail
1829 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1831 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1833 video_thumbnail = mobj.group(1).decode('utf-8')
1835 # Extract video description
1836 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1838 self._downloader.trouble(u'ERROR: unable to extract video description')
1840 video_description = mobj.group(1).decode('utf-8')
1841 if not video_description: video_description = 'No description available.'
1843 # Extract video height and width
1844 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1846 self._downloader.trouble(u'ERROR: unable to extract video height')
1848 yv_video_height = mobj.group(1)
1850 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1852 self._downloader.trouble(u'ERROR: unable to extract video width')
1854 yv_video_width = mobj.group(1)
1856 # Retrieve video playlist to extract media URL
1857 # I'm not completely sure what all these options are, but we
1858 # seem to need most of them, otherwise the server sends a 401.
1859 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1860 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1861 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1862 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1863 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1865 self.report_download_webpage(video_id)
1866 webpage = urllib2.urlopen(request).read()
1867 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1868 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1871 # Extract media URL from playlist XML
1872 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1874 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1876 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1877 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1880 # Process video information
1881 self._downloader.process_info({
1882 'id': video_id.decode('utf-8'),
1884 'uploader': video_uploader,
1885 'upload_date': u'NA',
1886 'title': video_title,
1887 'stitle': simple_title,
1888 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries win (the un-decoded variants) -- looks like a
# copy-paste slip worth cleaning up.
1889 'thumbnail': video_thumbnail.decode('utf-8'),
1890 'description': video_description,
1891 'thumbnail': video_thumbnail,
1892 'description': video_description,
1895 except UnavailableVideoError:
1896 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# GenericIE header: the last-resort extractor tried when no site-specific
# extractor claims a URL.
1899 class GenericIE(InfoExtractor):
1900 """Generic last-resort information extractor."""
# Plain initializer: all shared state lives in the InfoExtractor base.
1902 def __init__(self, downloader=None):
1903 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic fallback is in use, then log the page fetch."""
    screen = self._downloader.to_screen
    screen(u'WARNING: Falling back on generic information extractor.')
    screen(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical. _real_initialize's body is elided.
1918 def _real_initialize(self):
# _real_extract: heuristic scrape -- look for a JW-Player-style file=
# parameter, derive id/extension from the media URL's basename, take the
# page <title> as title and the domain as uploader.
1921 def _real_extract(self, url):
1922 # At this point we have a new video
1923 self._downloader.increment_downloads()
1925 video_id = url.split('/')[-1]
1926 request = urllib2.Request(url)
1928 self.report_download_webpage(video_id)
1929 webpage = urllib2.urlopen(request).read()
1930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1933 except ValueError, err:
1934 # since this is the last-resort InfoExtractor, if
1935 # this error is thrown, it'll be thrown here
1936 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1939 self.report_extraction(video_id)
1940 # Start with something easy: JW Player in SWFObject
1941 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1943 # Broaden the search a little bit
1944 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1946 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1949 # It's possible that one of the regexes
1950 # matched, but returned an empty group:
1951 if mobj.group(1) is None:
1952 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1955 video_url = urllib.unquote(mobj.group(1))
1956 video_id = os.path.basename(video_url)
1958 # here's a fun little line of code for you:
1959 video_extension = os.path.splitext(video_id)[1][1:]
1960 video_id = os.path.splitext(video_id)[0]
1962 # it's tempting to parse this further, but you would
1963 # have to take into account all the variations like
1964 # Video Title - Site Name
1965 # Site Name | Video Title
1966 # Video Title - Tagline | Site Name
1967 # and so on and so forth; it's just not practical
1968 mobj = re.search(r'<title>(.*)</title>', webpage)
1970 self._downloader.trouble(u'ERROR: unable to extract title')
1972 video_title = mobj.group(1).decode('utf-8')
1973 video_title = sanitize_title(video_title)
1974 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1976 # video uploader is domain name
1977 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error string says "title" but the step extracts the
# uploader/domain -- misleading message worth fixing upstream.
1979 self._downloader.trouble(u'ERROR: unable to extract title')
1981 video_uploader = mobj.group(1).decode('utf-8')
1984 # Process video information
1985 self._downloader.process_info({
1986 'id': video_id.decode('utf-8'),
1987 'url': video_url.decode('utf-8'),
1988 'uploader': video_uploader,
1989 'upload_date': u'NA',
1990 'title': video_title,
1991 'stitle': simple_title,
1992 'ext': video_extension.decode('utf-8'),
1996 except UnavailableVideoError, err:
1997 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing -- leading integers are extraction artifacts.
# YoutubeSearchIE header: 'ytsearchN:' / 'ytsearchall:' query syntax; the
# indicator regexes locate result links and the "Next" pager on result pages.
2000 class YoutubeSearchIE(InfoExtractor):
2001 """Information Extractor for YouTube search queries."""
2002 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2003 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2004 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2005 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results; 'all' and oversized N are clamped to this.
2007 _max_youtube_results = 1000
def __init__(self, youtube_ie, downloader=None):
    """Initialize the search extractor.

    The YoutubeIE instance is kept so each search hit can be handed off
    to the real YouTube extractor.
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
# Tail of suitable() (def line elided): True on a query-syntax match.
2015 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Announce the fetch of result page *pagenum* for *query*."""
    # Queries arrive as byte strings; decode with the terminal's preferred
    # encoding so the status line renders correctly.
    query = query.decode(preferredencoding())
    message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self._downloader.to_screen(message)
2022 def _real_initialize(self):
2023 self._youtube_ie.initialize()
# NOTE(review): elided listing -- leading integers are extraction artifacts;
# code lines are left byte-identical.
# _real_extract: parse the 'ytsearch[N|all]:' prefix into a result count,
# clamping to _max_youtube_results, then delegate to _download_n_results.
2025 def _real_extract(self, query):
2026 mobj = re.match(self._VALID_QUERY, query)
2028 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2031 prefix, query = query.split(':')
2033 query = query.encode('utf-8')
2035 self._download_n_results(query, 1)
2037 elif prefix == 'all':
2038 self._download_n_results(query, self._max_youtube_results)
2044 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2046 elif n > self._max_youtube_results:
2047 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2048 n = self._max_youtube_results
2049 self._download_n_results(query, n)
2051 except ValueError: # parsing prefix as integer fails
2052 self._download_n_results(query, 1)
# _download_n_results: page through result pages, de-duplicating ids, until
# n ids are collected or the "Next" pager disappears, then hand each id to
# the YouTube extractor.
2055 def _download_n_results(self, query, n):
2056 """Downloads a specified number of results for a query"""
2059 already_seen = set()
2063 self.report_download_page(query, pagenum)
2064 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2065 request = urllib2.Request(result_url)
2067 page = urllib2.urlopen(request).read()
2068 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2069 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2072 # Extract video identifiers
2073 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2074 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2075 if video_id not in already_seen:
2076 video_ids.append(video_id)
2077 already_seen.add(video_id)
2078 if len(video_ids) == n:
2079 # Specified n videos reached
2080 for id in video_ids:
2081 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2084 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2085 for id in video_ids:
2086 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2089 pagenum = pagenum + 1
2091 class GoogleSearchIE(InfoExtractor):
2092 """Information Extractor for Google Video search queries."""
# Handles "gvsearchN:QUERY" / "gvsearchall:QUERY"; structurally a twin of
# YoutubeSearchIE, but scrapes video.google.com result pages and drives the
# wrapped GoogleIE.
# NOTE(review): embedded original line numbers jump, so guard/try/loop-header
# lines are elided from this listing; this is an excerpt, not runnable as-is.
2093 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2094 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2095 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2096 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2098 _max_google_results = 1000
2100 def __init__(self, google_ie, downloader=None):
2101 InfoExtractor.__init__(self, downloader)
2102 self._google_ie = google_ie
2106 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2108 def report_download_page(self, query, pagenum):
2109 """Report attempt to download playlist page with given number."""
2110 query = query.decode(preferredencoding())
2111 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2113 def _real_initialize(self):
2114 self._google_ie.initialize()
2116 def _real_extract(self, query):
2117 mobj = re.match(self._VALID_QUERY, query)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2119 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2122 prefix, query = query.split(':')
2124 query = query.encode('utf-8')
2126 self._download_n_results(query, 1)
2128 elif prefix == 'all':
2129 self._download_n_results(query, self._max_google_results)
# Numeric-prefix branch (int() call elided; ValueError handled below).
2135 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2137 elif n > self._max_google_results:
2138 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2139 n = self._max_google_results
2140 self._download_n_results(query, n)
2142 except ValueError: # parsing prefix as integer fails
2143 self._download_n_results(query, 1)
2146 def _download_n_results(self, query, n):
2147 """Downloads a specified number of results for a query"""
2150 already_seen = set()
# Page loop (header elided): fetch pages until n ids or no "Next" link.
2154 self.report_download_page(query, pagenum)
2155 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2156 request = urllib2.Request(result_url)
2158 page = urllib2.urlopen(request).read()
2159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2160 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2163 # Extract video identifiers
2164 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2165 video_id = mobj.group(1)
2166 if video_id not in already_seen:
2167 video_ids.append(video_id)
2168 already_seen.add(video_id)
2169 if len(video_ids) == n:
2170 # Specified n videos reached
2171 for id in video_ids:
2172 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2175 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2176 for id in video_ids:
2177 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2180 pagenum = pagenum + 1
2182 class YahooSearchIE(InfoExtractor):
2183 """Information Extractor for Yahoo! Video search queries."""
# Handles "yvsearchN:QUERY" / "yvsearchall:QUERY"; third twin of the search
# extractors above, scraping video.yahoo.com and driving the wrapped YahooIE.
# NOTE(review): embedded original line numbers jump, so guard/try/loop-header
# lines are elided from this listing; this is an excerpt, not runnable as-is.
2184 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2185 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2186 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2187 _MORE_PAGES_INDICATOR = r'\s*Next'
2189 _max_yahoo_results = 1000
2191 def __init__(self, yahoo_ie, downloader=None):
2192 InfoExtractor.__init__(self, downloader)
2193 self._yahoo_ie = yahoo_ie
2197 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2199 def report_download_page(self, query, pagenum):
2200 """Report attempt to download playlist page with given number."""
2201 query = query.decode(preferredencoding())
2202 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2204 def _real_initialize(self):
2205 self._yahoo_ie.initialize()
2207 def _real_extract(self, query):
2208 mobj = re.match(self._VALID_QUERY, query)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2210 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2213 prefix, query = query.split(':')
2215 query = query.encode('utf-8')
2217 self._download_n_results(query, 1)
2219 elif prefix == 'all':
2220 self._download_n_results(query, self._max_yahoo_results)
# Numeric-prefix branch (int() call elided; ValueError handled below).
2226 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2228 elif n > self._max_yahoo_results:
2229 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2230 n = self._max_yahoo_results
2231 self._download_n_results(query, n)
2233 except ValueError: # parsing prefix as integer fails
2234 self._download_n_results(query, 1)
2237 def _download_n_results(self, query, n):
2238 """Downloads a specified number of results for a query"""
2241 already_seen = set()
# Page loop (header elided): fetch pages until n ids or no "Next" link.
2245 self.report_download_page(query, pagenum)
2246 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2247 request = urllib2.Request(result_url)
2249 page = urllib2.urlopen(request).read()
2250 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2251 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2254 # Extract video identifiers
2255 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2256 video_id = mobj.group(1)
2257 if video_id not in already_seen:
2258 video_ids.append(video_id)
2259 already_seen.add(video_id)
2260 if len(video_ids) == n:
2261 # Specified n videos reached
2262 for id in video_ids:
2263 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2266 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2267 for id in video_ids:
2268 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2271 pagenum = pagenum + 1
2273 class YoutubePlaylistIE(InfoExtractor):
2274 """Information Extractor for YouTube playlists."""
# Walks a playlist/artist/user page set, collects watch?v= ids, applies the
# playliststart/playlistend window, and delegates each id to YoutubeIE.
# NOTE(review): embedded original line numbers jump, so "if mobj is None:",
# "try:", "else:" and loop headers are elided; excerpt, not runnable as-is.
2276 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2277 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2278 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2279 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2282 def __init__(self, youtube_ie, downloader=None):
2283 InfoExtractor.__init__(self, downloader)
2284 self._youtube_ie = youtube_ie
2288 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2290 def report_download_page(self, playlist_id, pagenum):
2291 """Report attempt to download playlist page with given number."""
2292 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2294 def _real_initialize(self):
2295 self._youtube_ie.initialize()
2297 def _real_extract(self, url):
2298 # Extract playlist id
2299 mobj = re.match(self._VALID_URL, url)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2301 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video link inside a playlist URL (group 3): extract just that one.
2305 if mobj.group(3) is not None:
2306 self._youtube_ie.extract(mobj.group(3))
2309 # Download playlist pages
2310 # prefix is 'p' as default for playlists but there are other types that need extra care
2311 playlist_prefix = mobj.group(1)
2312 if playlist_prefix == 'a':
2313 playlist_access = 'artist'
# else-branch (header elided): ordinary playlist uses view_play_list?p=...
2315 playlist_prefix = 'p'
2316 playlist_access = 'view_play_list'
2317 playlist_id = mobj.group(2)
# Page loop (header elided).
2322 self.report_download_page(playlist_id, pagenum)
2323 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2325 page = urllib2.urlopen(request).read()
2326 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2327 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2330 # Extract video identifiers
2332 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2333 if mobj.group(1) not in ids_in_page:
2334 ids_in_page.append(mobj.group(1))
2335 video_ids.extend(ids_in_page)
2337 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2339 pagenum = pagenum + 1
# Apply the user-requested playlist window (1-based start converted to
# a 0-based slice index).
2341 playliststart = self._downloader.params.get('playliststart', 1) - 1
2342 playlistend = self._downloader.params.get('playlistend', -1)
2343 video_ids = video_ids[playliststart:playlistend]
2345 for id in video_ids:
2346 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2349 class YoutubeUserIE(InfoExtractor):
2350 """Information Extractor for YouTube users."""
# Pages through the (now-retired) GData uploads feed for a user,
# _GDATA_PAGE_SIZE ids at a time, then delegates each id to YoutubeIE.
# NOTE(review): embedded original line numbers jump, so guards, "try:" and
# the paging loop header are elided; excerpt, not runnable as shown.
2352 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2353 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2354 _GDATA_PAGE_SIZE = 50
2355 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2356 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2359 def __init__(self, youtube_ie, downloader=None):
2360 InfoExtractor.__init__(self, downloader)
2361 self._youtube_ie = youtube_ie
2365 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2367 def report_download_page(self, username, start_index):
2368 """Report attempt to download user page."""
2369 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2370 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2372 def _real_initialize(self):
2373 self._youtube_ie.initialize()
2375 def _real_extract(self, url):
2377 mobj = re.match(self._VALID_URL, url)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2379 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2382 username = mobj.group(1)
2384 # Download video ids using YouTube Data API. Result size per
2385 # query is limited (currently to 50 videos) so we need to query
2386 # page by page until there are no video ids - it means we got
# Paging loop (header elided); GData start-index is 1-based.
2393 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2394 self.report_download_page(username, start_index)
2396 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2399 page = urllib2.urlopen(request).read()
2400 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2404 # Extract video identifiers
2407 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2408 if mobj.group(1) not in ids_in_page:
2409 ids_in_page.append(mobj.group(1))
2411 video_ids.extend(ids_in_page)
2413 # A little optimization - if current page is not
2414 # "full", ie. does not contain PAGE_SIZE video ids then
2415 # we can assume that this page is the last one - there
2416 # are no more ids on further pages - no need to query
2419 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the playliststart/playlistend window; -1 end means "to the end".
2424 all_ids_count = len(video_ids)
2425 playliststart = self._downloader.params.get('playliststart', 1) - 1
2426 playlistend = self._downloader.params.get('playlistend', -1)
2428 if playlistend == -1:
2429 video_ids = video_ids[playliststart:]
2431 video_ids = video_ids[playliststart:playlistend]
2433 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2434 (username, all_ids_count, len(video_ids)))
2436 for video_id in video_ids:
2437 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2440 class DepositFilesIE(InfoExtractor):
2441 """Information extractor for depositfiles.com"""
# Rewrites the URL to the English locale, POSTs the "Free download" form,
# then scrapes the real fileshare URL and title out of the returned page.
# NOTE(review): embedded original line numbers jump, so guards, "try:" and
# "return" lines are elided; excerpt, not runnable as shown.
2443 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2445 def __init__(self, downloader=None):
2446 InfoExtractor.__init__(self, downloader)
2450 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2452 def report_download_webpage(self, file_id):
2453 """Report webpage download."""
2454 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2456 def report_extraction(self, file_id):
2457 """Report information extraction."""
2458 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2460 def _real_initialize(self):
2463 def _real_extract(self, url):
2464 # At this point we have a new file
2465 self._downloader.increment_downloads()
2467 file_id = url.split('/')[-1]
2468 # Rebuild url in english locale
2469 url = 'http://depositfiles.com/en/files/' + file_id
2471 # Retrieve file webpage with 'Free download' button pressed
2472 free_download_indication = { 'gateway_result' : '1' }
# urlencode of the form dict makes this a POST request.
2473 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2475 self.report_download_webpage(file_id)
2476 webpage = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2481 # Search for the real file URL
2482 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2483 if (mobj is None) or (mobj.group(1) is None):
2484 # Try to figure out reason of the error.
2485 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2486 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's own restriction notice before showing it.
2487 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2488 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2490 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2493 file_url = mobj.group(1)
2494 file_extension = os.path.splitext(file_url)[1][1:]
2496 # Search for file title
2497 mobj = re.search(r'<b title="(.*?)">', webpage)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2499 self._downloader.trouble(u'ERROR: unable to extract title')
2501 file_title = mobj.group(1).decode('utf-8')
2504 # Process file information
2505 self._downloader.process_info({
2506 'id': file_id.decode('utf-8'),
2507 'url': file_url.decode('utf-8'),
2509 'upload_date': u'NA',
2510 'title': file_title,
2511 'stitle': file_title,
2512 'ext': file_extension.decode('utf-8'),
2516 except UnavailableVideoError, err:
2517 self._downloader.trouble(u'ERROR: unable to download file')
2519 class FacebookIE(InfoExtractor):
2520 """Information Extractor for Facebook"""
# Logs in via the mobile login form (credentials from CLI options or .netrc),
# downloads the video page, and parses title/owner/date/thumbnail plus
# per-format source URLs out of escaped JavaScript segments in the page.
# NOTE(review): embedded original line numbers jump, so guards, "try:",
# "return" and some assignments are elided; excerpt, not runnable as shown.
2522 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2523 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2524 _NETRC_MACHINE = 'facebook'
2525 _available_formats = ['highqual', 'lowqual']
2526 _video_extensions = {
2531 def __init__(self, downloader=None):
2532 InfoExtractor.__init__(self, downloader)
2536 return (re.match(FacebookIE._VALID_URL, url) is not None)
2538 def _reporter(self, message):
2539 """Add header and report message."""
2540 self._downloader.to_screen(u'[facebook] %s' % message)
2542 def report_login(self):
2543 """Report attempt to log in."""
2544 self._reporter(u'Logging in')
2546 def report_video_webpage_download(self, video_id):
2547 """Report attempt to download video webpage."""
2548 self._reporter(u'%s: Downloading video webpage' % video_id)
2550 def report_information_extraction(self, video_id):
2551 """Report attempt to extract video information."""
2552 self._reporter(u'%s: Extracting video information' % video_id)
2554 def _parse_page(self, video_webpage):
2555 """Extract video information from page"""
# Map of info-dict key -> regex used to scrape it from the page HTML/JS.
2557 data = {'title': r'class="video_title datawrap">(.*?)</',
2558 'description': r'<div class="datawrap">(.*?)</div>',
2559 'owner': r'\("video_owner_name", "(.*?)"\)',
2560 'upload_date': r'data-date="(.*?)"',
2561 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2564 for piece in data.keys():
2565 mobj = re.search(data[piece], video_webpage)
2566 if mobj is not None:
# Values are escaped-Unicode inside JS strings; decode then unquote.
2567 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2571 for fmt in self._available_formats:
2572 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2573 if mobj is not None:
2574 # URL is in a Javascript segment inside an escaped Unicode format within
2575 # the generally utf-8 page
2576 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2577 video_info['video_urls'] = video_urls
2581 def _real_initialize(self):
2582 if self._downloader is None:
2587 downloader_params = self._downloader.params
2589 # Attempt to use provided username and password or .netrc data
2590 if downloader_params.get('username', None) is not None:
2591 useremail = downloader_params['username']
2592 password = downloader_params['password']
2593 elif downloader_params.get('usenetrc', False):
2595 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2596 if info is not None:
# NOTE(review): the assignments taking useremail/password from the netrc
# entry are elided here (lines 2597-2599 of the original).
2600 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2601 except (IOError, netrc.NetrcParseError), err:
2602 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: login is skipped (return elided).
2605 if useremail is None:
# Login form construction elided; the POST below submits it.
2614 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2617 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2618 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2619 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2621 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2622 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2625 def _real_extract(self, url):
2626 mobj = re.match(self._VALID_URL, url)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2628 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2630 video_id = mobj.group('ID')
2633 self.report_video_webpage_download(video_id)
2634 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2636 page = urllib2.urlopen(request)
2637 video_webpage = page.read()
2638 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2639 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2642 # Start extracting information
2643 self.report_information_extraction(video_id)
2645 # Extract information
2646 video_info = self._parse_page(video_webpage)
2649 if 'owner' not in video_info:
2650 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2652 video_uploader = video_info['owner']
2655 if 'title' not in video_info:
2656 self._downloader.trouble(u'ERROR: unable to extract video title')
2658 video_title = video_info['title']
2659 video_title = video_title.decode('utf-8')
2660 video_title = sanitize_title(video_title)
# Build the "simple title": non-allowed chars collapsed to underscores.
2663 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2664 simple_title = simple_title.strip(ur'_')
2667 if 'thumbnail' not in video_info:
2668 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2669 video_thumbnail = ''
2671 video_thumbnail = video_info['thumbnail']
# Upload date: parse the RFC-2822-style date attribute into YYYYMMDD.
2675 if 'upload_date' in video_info:
2676 upload_time = video_info['upload_date']
2677 timetuple = email.utils.parsedate_tz(upload_time)
2678 if timetuple is not None:
2680 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2685 video_description = video_info.get('description', 'No description available.')
2687 url_map = video_info['video_urls']
2688 if len(url_map.keys()) > 0:
2689 # Decide which formats to download
2690 req_format = self._downloader.params.get('format', None)
2691 format_limit = self._downloader.params.get('format_limit', None)
2693 if format_limit is not None and format_limit in self._available_formats:
2694 format_list = self._available_formats[self._available_formats.index(format_limit):]
2696 format_list = self._available_formats
2697 existing_formats = [x for x in format_list if x in url_map]
2698 if len(existing_formats) == 0:
2699 self._downloader.trouble(u'ERROR: no known formats available for video')
2701 if req_format is None:
2702 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2703 elif req_format == '-1':
2704 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# else-branch (header elided): a specific format was requested.
2707 if req_format not in url_map:
2708 self._downloader.trouble(u'ERROR: requested format not available')
2710 video_url_list = [(req_format, url_map[req_format])] # Specific format
2712 for format_param, video_real_url in video_url_list:
2714 # At this point we have a new video
2715 self._downloader.increment_downloads()
2718 video_extension = self._video_extensions.get(format_param, 'mp4')
2720 # Find the video URL in fmt_url_map or conn paramters
2722 # Process video information
2723 self._downloader.process_info({
2724 'id': video_id.decode('utf-8'),
2725 'url': video_real_url.decode('utf-8'),
2726 'uploader': video_uploader.decode('utf-8'),
2727 'upload_date': upload_date,
2728 'title': video_title,
2729 'stitle': simple_title,
2730 'ext': video_extension.decode('utf-8'),
2731 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2732 'thumbnail': video_thumbnail.decode('utf-8'),
2733 'description': video_description.decode('utf-8'),
2736 except UnavailableVideoError, err:
2737 self._downloader.trouble(u'\nERROR: unable to download video')
2739 class BlipTVIE(InfoExtractor):
2740 """Information extractor for blip.tv"""
# Fetches the blip.tv JSON API variant of the page (skin=json) and builds the
# info dict straight from the returned 'Post' object.
# NOTE(review): embedded original line numbers jump, so guards, "try:" and
# "return" lines are elided; excerpt, not runnable as shown.
2742 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2743 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2747 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2749 def report_extraction(self, file_id):
2750 """Report information extraction."""
2751 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2753 def _simplify_title(self, title):
# Same simple-title rule used elsewhere in this file: collapse characters
# outside simple_title_chars to underscores, then strip them at the ends.
2754 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2755 res = res.strip(ur'_')
2758 def _real_extract(self, url):
2759 mobj = re.match(self._VALID_URL, url)
# NOTE(review): "if mobj is None:" guard elided before this error path.
2761 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Append the JSON-skin query with the correct separator for the URL.
2764 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2765 request = urllib2.Request(json_url)
2766 self.report_extraction(mobj.group(1))
2768 json_code = urllib2.urlopen(request).read()
2769 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2770 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2773 json_data = json.loads(json_code)
2774 data = json_data['Post'] if 'Post' in json_data else json_data
# blip.tv datestamps look like "m-d-y H:M(am|pm)"; normalize to YYYYMMDD.
2776 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2777 video_url = data['media']['url']
2778 umobj = re.match(self._URL_EXT, video_url)
# NOTE(review): "if umobj is None:" guard elided before this raise.
2780 raise ValueError('Can not determine filename extension')
2781 ext = umobj.group(1)
2783 self._downloader.increment_downloads()
2786 'id': data['item_id'],
2788 'uploader': data['display_name'],
2789 'upload_date': upload_date,
2790 'title': data['title'],
2791 'stitle': self._simplify_title(data['title']),
2793 'format': data['media']['mimeType'],
2794 'thumbnail': data['thumbnailUrl'],
2795 'description': data['description'],
2796 'player_url': data['embedUrl']
2798 except (ValueError,KeyError), err:
2799 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2803 self._downloader.process_info(info)
2804 except UnavailableVideoError, err:
2805 self._downloader.trouble(u'\nERROR: unable to download video')
2808 class PostProcessor(object):
2809 """Post Processor class.
2811 PostProcessor objects can be added to downloaders with their
2812 add_post_processor() method. When the downloader has finished a
2813 successful download, it will take its internal chain of PostProcessors
2814 and start calling the run() method on each one of them, first with
2815 an initial argument and then with the returned value of the previous
2818 The chain will be stopped if one of them ever returns None or the end
2819 of the chain is reached.
2821 PostProcessor objects follow a "mutual registration" process similar
2822 to InfoExtractor objects.
# Base class: subclasses override run(); the default run() is a no-op that
# passes the information dict through unchanged.
2827 def __init__(self, downloader=None):
2828 self._downloader = downloader
2830 def set_downloader(self, downloader):
2831 """Sets the downloader for this PP."""
2832 self._downloader = downloader
2834 def run(self, information):
2835 """Run the PostProcessor.
2837 The "information" argument is a dictionary like the ones
2838 composed by InfoExtractors. The only difference is that this
2839 one has an extra field called "filepath" that points to the
2842 When this method returns None, the postprocessing chain is
2843 stopped. However, this method may return an information
2844 dictionary that will be passed to the next postprocessing
2845 object in the chain. It can be the one it received after
2846 changing some fields.
2848 In addition, this method may raise a PostProcessingError
2849 exception that will be taken into account by the downloader
2852 return information # by default, do nothing
2854 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that extracts the audio track of a downloaded video with
# ffmpeg/ffprobe, copying the stream losslessly when possible and otherwise
# transcoding to the preferred codec.
# NOTE(review): embedded original line numbers jump, so "@staticmethod"
# decorators(?), "try:"/"return" lines and some branch headers are elided;
# excerpt, not runnable as shown.
2856 def __init__(self, downloader=None, preferredcodec=None):
2857 PostProcessor.__init__(self, downloader)
2858 if preferredcodec is None:
# 'best' means: keep the source codec when it is aac/mp3, else mp3.
2859 preferredcodec = 'best'
2860 self._preferredcodec = preferredcodec
2863 def get_audio_codec(path):
# Probe the file with ffprobe and return the audio stream's codec_name,
# or (via elided returns) None when probing fails.
2865 cmd = ['ffprobe', '-show_streams', '--', path]
2866 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2867 output = handle.communicate()[0]
2868 if handle.wait() != 0:
2870 except (IOError, OSError):
2873 for line in output.split('\n'):
2874 if line.startswith('codec_name='):
2875 audio_codec = line.split('=')[1].strip()
2876 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2881 def run_ffmpeg(path, out_path, codec, more_opts):
# Invoke ffmpeg to write the audio-only output file; list-form argv avoids
# the shell, and '--' guards against option-like filenames.
2883 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2884 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2886 except (IOError, OSError):
2889 def run(self, information):
2890 path = information['filepath']
2892 filecodec = self.get_audio_codec(path)
2893 if filecodec is None:
2894 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2898 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2899 if filecodec == 'aac' or filecodec == 'mp3':
2900 # Lossless if possible
# (elided) acodec = 'copy' — stream-copy without re-encoding.
2902 extension = filecodec
2903 if filecodec == 'aac':
2904 more_opts = ['-f', 'adts']
# else-branch (header elided): source codec not directly usable, go mp3.
2907 acodec = 'libmp3lame'
2909 more_opts = ['-ab', '128k']
# else-branch (header elided): user asked for a specific codec.
2911 # We convert the audio (lossy)
2912 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2913 extension = self._preferredcodec
2914 more_opts = ['-ab', '128k']
2915 if self._preferredcodec == 'aac':
2916 more_opts += ['-f', 'adts']
2918 (prefix, ext) = os.path.splitext(path)
2919 new_path = prefix + '.' + extension
2920 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2921 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
# NOTE(review): failure check on status elided before this warning path.
2924 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Try to remove the original video file (os.remove in an elided try:).
2929 except (IOError, OSError):
2930 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
2933 information['filepath'] = new_path
2936 ### MAIN PROGRAM ###
2937 if __name__ == '__main__':
2939 # Modules needed only when running the main program
2943 # Function to update the program file with the latest version from the repository.
2944 def update_self(downloader, filename):
2945 # Note: downloader only used for options
2946 if not os.access(filename, os.W_OK):
2947 sys.exit('ERROR: no write permissions on %s' % filename)
2949 downloader.to_screen('Updating to latest stable version...')
2951 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2952 latest_version = urllib.urlopen(latest_url).read().strip()
2953 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2954 newcontent = urllib.urlopen(prog_url).read()
2955 except (IOError, OSError), err:
2956 sys.exit('ERROR: unable to download latest version')
2958 stream = open(filename, 'w')
2959 stream.write(newcontent)
2961 except (IOError, OSError), err:
2962 sys.exit('ERROR: unable to overwrite current version')
2963 downloader.to_screen('Updated to version %s' % latest_version)
2965 # Parse command line
2966 parser = optparse.OptionParser(
2967 usage='Usage: %prog [options] url...',
2968 version='2011.07.09-phihag',
2969 conflict_handler='resolve',
2972 parser.add_option('-h', '--help',
2973 action='help', help='print this help text and exit')
2974 parser.add_option('-v', '--version',
2975 action='version', help='print program version and exit')
2976 parser.add_option('-U', '--update',
2977 action='store_true', dest='update_self', help='update this program to latest stable version')
2978 parser.add_option('-i', '--ignore-errors',
2979 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2980 parser.add_option('-r', '--rate-limit',
2981 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2982 parser.add_option('-R', '--retries',
2983 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2984 parser.add_option('--playlist-start',
2985 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2986 parser.add_option('--playlist-end',
2987 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2988 parser.add_option('--dump-user-agent',
2989 action='store_true', dest='dump_user_agent',
2990 help='display the current browser identification', default=False)
2992 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2993 authentication.add_option('-u', '--username',
2994 dest='username', metavar='USERNAME', help='account username')
2995 authentication.add_option('-p', '--password',
2996 dest='password', metavar='PASSWORD', help='account password')
2997 authentication.add_option('-n', '--netrc',
2998 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2999 parser.add_option_group(authentication)
3001 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3002 video_format.add_option('-f', '--format',
3003 action='store', dest='format', metavar='FORMAT', help='video format code')
3004 video_format.add_option('--all-formats',
3005 action='store_const', dest='format', help='download all available video formats', const='-1')
3006 video_format.add_option('--max-quality',
3007 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3008 parser.add_option_group(video_format)
# Verbosity / simulation switches.  Each --get-* flag is folded later into
# the FileDownloader's 'quiet' and 'simulate' settings, so any of them
# implies a quiet dry run that only prints the requested field.
3010 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3011 verbosity.add_option('-q', '--quiet',
3012 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3013 verbosity.add_option('-s', '--simulate',
3014 action='store_true', dest='simulate', help='do not download video', default=False)
3015 verbosity.add_option('-g', '--get-url',
3016 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3017 verbosity.add_option('-e', '--get-title',
3018 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3019 verbosity.add_option('--get-thumbnail',
3020 action='store_true', dest='getthumbnail',
3021 help='simulate, quiet but print thumbnail URL', default=False)
3022 verbosity.add_option('--get-description',
3023 action='store_true', dest='getdescription',
3024 help='simulate, quiet but print video description', default=False)
3025 verbosity.add_option('--get-filename',
3026 action='store_true', dest='getfilename',
3027 help='simulate, quiet but print output filename', default=False)
3028 verbosity.add_option('--no-progress',
3029 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3030 verbosity.add_option('--console-title',
3031 action='store_true', dest='consoletitle',
3032 help='display progress in console titlebar', default=False)
3033 parser.add_option_group(verbosity)
# Filesystem / output-file options: filename templating (-t/-l/-A/-o),
# batch input, overwrite/resume behaviour and metadata side files.
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title',
	action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
	action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
	action='store_true', dest='autonumber',
	help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
	dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
	dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
	action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
	action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
# BUGFIX: the cookie file is also *read* at startup when it exists (the jar
# is loaded from it before downloading), not only dumped to at exit, so the
# help text must describe both directions.
filesystem.add_option('--cookies',
	dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
	action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
	action='store_false', dest='updatetime',
	help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
	action='store_true', dest='writedescription',
	help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
	action='store_true', dest='writeinfojson',
	help='write video metadata to a .info.json file', default=False)
parser.add_option_group(filesystem)
# Post-processing switches: optional audio extraction (needs external
# ffmpeg and ffprobe binaries) and the target audio codec.
pp_group = optparse.OptionGroup(parser, 'Post-processing Options')
pp_group.add_option('--extract-audio',
	action='store_true', dest='extractaudio', default=False,
	help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
pp_group.add_option('--audio-format',
	dest='audioformat', metavar='FORMAT', default='best',
	help='"best", "aac" or "mp3"; best by default')
parser.add_option_group(pp_group)

# All option groups are registered; parse the command line now.
opts, args = parser.parse_args()
# Cookie handling: default to an in-memory CookieJar; with --cookies use a
# MozillaCookieJar backed by the given file, loading it first when it is a
# readable file.  NOTE(review): the else:/try: lines and the jar.load() call
# are elided in this extraction (line numbers jump 3077->3080 and 3081->3083).
3075 # Open appropriate CookieJar
3076 if opts.cookiefile is None:
3077 jar = cookielib.CookieJar()
3080 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3081 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3083 except (IOError, OSError), err:
3084 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string from std_headers (Python 2 print
# statement).  NOTE(review): lines 3089-3090 are elided; presumably an exit
# follows the print -- confirm against the full source.
3087 if opts.dump_user_agent:
3088 print std_headers['User-Agent']
# Install a global urllib2 opener (proxy handler + cookie processor +
# project YoutubeDLHandler) and a 5-minute default socket timeout.
3091 # General configuration
3092 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3093 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3094 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Build the full URL list: batch-file lines (stripped; lines starting with
# '#', '/' or ';' are treated as comments and dropped) plus the positional
# command-line arguments.  NOTE(review): the surrounding try:, the
# stdin ('-') branch and the except clause matching the sys.exit below are
# elided in this extraction (gaps at 3097, 3099, 3101-3102, 3107).
3096 # Batch file verification
3098 if opts.batchfile is not None:
3100 if opts.batchfile == '-':
3103 batchfd = open(opts.batchfile, 'r')
3104 batchurls = batchfd.readlines()
3105 batchurls = [x.strip() for x in batchurls]
3106 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3108 sys.exit(u'ERROR: batch file could not be read')
3109 all_urls = batchurls + args
# Cross-validate parsed options; parser.error() prints the message and exits.
3111 # Conflicting, missing and erroneous options
3112 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3113 parser.error(u'using .netrc conflicts with giving username/password')
3114 if opts.password is not None and opts.username is None:
3115 parser.error(u'account username missing')
3116 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3117 parser.error(u'using output template conflicts with using title, literal title or auto number')
3118 if opts.usetitle and opts.useliteral:
3119 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (never echoes).
3120 if opts.username is not None and opts.password is None:
3121 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize string options to numbers in place (Python 2 long()).
# NOTE(review): the try: statements and some error branches of the
# retries/playliststart/playlistend conversions are elided in this
# extraction (gaps at 3128, 3132, 3135, 3138, 3141).
3122 if opts.ratelimit is not None:
3123 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3124 if numeric_limit is None:
3125 parser.error(u'invalid rate limit specified')
3126 opts.ratelimit = numeric_limit
3127 if opts.retries is not None:
3129 opts.retries = long(opts.retries)
3130 except (TypeError, ValueError), err:
3131 parser.error(u'invalid retry count specified')
3133 opts.playliststart = long(opts.playliststart)
3134 if opts.playliststart <= 0:
3136 except (TypeError, ValueError), err:
3137 parser.error(u'invalid playlist start number specified')
3139 opts.playlistend = long(opts.playlistend)
# playlistend of -1 means "to the end"; otherwise it must be positive and
# not before playliststart.
3140 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3142 except (TypeError, ValueError), err:
3143 parser.error(u'invalid playlist end number specified')
# --extract-audio accepts only the three codec choices advertised in --help.
3144 if opts.extractaudio:
3145 if opts.audioformat not in ['best', 'aac', 'mp3']:
3146 parser.error(u'invalid audio format specified')
# Instantiate one InformationExtractor per supported site.  youtube_ie is
# created first because the playlist/user/search extractors and MetacafeIE
# are constructed around it; likewise google_ie and yahoo_ie feed their
# search counterparts.
3148 # Information extractors
3149 youtube_ie = YoutubeIE()
3150 metacafe_ie = MetacafeIE(youtube_ie)
3151 dailymotion_ie = DailymotionIE()
3152 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3153 youtube_user_ie = YoutubeUserIE(youtube_ie)
3154 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3155 google_ie = GoogleIE()
3156 google_search_ie = GoogleSearchIE(google_ie)
3157 photobucket_ie = PhotobucketIE()
3158 yahoo_ie = YahooIE()
3159 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3160 deposit_files_ie = DepositFilesIE()
3161 facebook_ie = FacebookIE()
3162 bliptv_ie = BlipTVIE()
3163 generic_ie = GenericIE()
# Build the FileDownloader with the effective configuration dict.
3166 fd = FileDownloader({
3167 'usenetrc': opts.usenetrc,
3168 'username': opts.username,
3169 'password': opts.password,
# Any --get-* flag forces both quiet and simulate modes.
3170 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3171 'forceurl': opts.geturl,
3172 'forcetitle': opts.gettitle,
3173 'forcethumbnail': opts.getthumbnail,
3174 'forcedescription': opts.getdescription,
3175 'forcefilename': opts.getfilename,
3176 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3177 'format': opts.format,
3178 'format_limit': opts.format_limit,
# Output template: an explicit -o wins (decoded from the locale encoding);
# otherwise the first matching and-chain below supplies a default built
# from the --all-formats / title / literal / auto-number flags, falling
# back to plain '%(id)s.%(ext)s'.
3179 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3180 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3181 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3182 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3183 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3184 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3185 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3186 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3187 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3188 or u'%(id)s.%(ext)s'),
3189 'ignoreerrors': opts.ignoreerrors,
3190 'ratelimit': opts.ratelimit,
3191 'nooverwrites': opts.nooverwrites,
3192 'retries': opts.retries,
3193 'continuedl': opts.continue_dl,
3194 'noprogress': opts.noprogress,
3195 'playliststart': opts.playliststart,
3196 'playlistend': opts.playlistend,
# Writing the video to stdout (-o -) means progress must go to stderr.
3197 'logtostderr': opts.outtmpl == '-',
3198 'consoletitle': opts.consoletitle,
3199 'nopart': opts.nopart,
3200 'updatetime': opts.updatetime,
3201 'writedescription': opts.writedescription,
3202 'writeinfojson': opts.writeinfojson,
# NOTE(review): the closing '})' of this FileDownloader(...) call (original
# line 3203) is elided from this extraction.
# Register the extractors with the downloader.  Registration order appears
# to determine which IE gets first crack at a URL (the search/playlist/user
# extractors come before plain youtube_ie -- confirm against
# FileDownloader.add_info_extractor); the generic IE is explicitly last.
3204 fd.add_info_extractor(youtube_search_ie)
3205 fd.add_info_extractor(youtube_pl_ie)
3206 fd.add_info_extractor(youtube_user_ie)
3207 fd.add_info_extractor(metacafe_ie)
3208 fd.add_info_extractor(dailymotion_ie)
3209 fd.add_info_extractor(youtube_ie)
3210 fd.add_info_extractor(google_ie)
3211 fd.add_info_extractor(google_search_ie)
3212 fd.add_info_extractor(photobucket_ie)
3213 fd.add_info_extractor(yahoo_ie)
3214 fd.add_info_extractor(yahoo_search_ie)
3215 fd.add_info_extractor(deposit_files_ie)
3216 fd.add_info_extractor(facebook_ie)
3217 fd.add_info_extractor(bliptv_ie)
3219 # This must come last since it's the
3220 # fallback if none of the others work
3221 fd.add_info_extractor(generic_ie)
# Optional post-processor: audio extraction with the codec chosen by
# --audio-format (validated earlier against best/aac/mp3).
3224 if opts.extractaudio:
3225 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# Self-update, URL sanity check, the actual download, and the cookie-jar
# dump.  NOTE(review): several lines are elided in this extraction (gaps at
# 3230-3231, 3235-3236, 3238, 3241-3242, 3245-3247, 3249), including the
# try: statements matched by the except clauses below and, presumably, the
# final exit with the download return code -- confirm against the full
# source.  The trailing DownloadError/SameFileError/KeyboardInterrupt
# handlers belong to an outer try: that starts before this chunk.
3228 if opts.update_self:
3229 update_self(fd, sys.argv[0])
# With no URLs at all this is only valid as a bare self-update run.
3232 if len(all_urls) < 1:
3233 if not opts.update_self:
3234 parser.error(u'you must provide at least one URL')
3237 retcode = fd.download(all_urls)
3239 # Dump cookie jar if requested
3240 if opts.cookiefile is not None:
3243 except (IOError, OSError), err:
3244 sys.exit(u'ERROR: unable to save cookie jar')
3248 except DownloadError:
3250 except SameFileError:
3251 sys.exit(u'ERROR: fixed output name but more than one file to download')
3252 except KeyboardInterrupt:
3253 sys.exit(u'\nERROR: Interrupted by user')