2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original wrapped this value in a one-shot generator and called
    # .next() on it -- needless indirection for a plain value; flattened.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and is usable.
        u'TEST'.encode(pref)
    except (LookupError, ValueError):
        pref = 'UTF-8'
    return pref
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Replace HTML entities (&amp;, &#39;, ...) with the characters they name.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    # os.sep inside a title would create extra path components; substitute '%'.
    return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
def timeconvert(timestr):
    """Convert an RFC 2822 defined time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # Both attributes are byte counts: what was actually received versus
    # what the server's Content-Length announced.
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:
      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Raw deflate streams (no zlib header) require the negative-wbits
        # form; fall back to a plain zlib stream if that fails.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Python >= 2.6 accepts the response code as a constructor argument;
        # older versions need it assigned after construction.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Install the standard headers, replacing any caller-set duplicates.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honour the internal no-compression marker and strip it before the
        # request goes out on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip-compressed body: wrap in a decompressing file object.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-compressed body: decompress eagerly into a StringIO.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
def __init__(self, params):
    """Create a FileDownloader object with the given options.

    params is the options dictionary documented on the class; it is kept
    as self.params for InfoExtractors to consult.
    """
    self._ies = []          # registered InfoExtractors, in priority order
    self._pps = []          # registered PostProcessors, run as a chain
    self._download_retcode = 0
    self._num_downloads = 0
    # Progress/status messages go to stderr when logtostderr is set, so
    # stdout stays clean for --get-url / --get-title style output.
    self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    self.params = params
@staticmethod
def format_bytes(bytes):
    """Format a byte count as a short human-readable string (e.g. 1.50M)."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        # Pick the largest power of 1024 not exceeding the value.
        exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
@staticmethod
def calc_percent(byte_counter, data_len):
    """Return download progress as a fixed-width percentage string."""
    # Unknown total length: no meaningful percentage can be shown.
    if data_len is None:
        return '---.-%'
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
@staticmethod
def calc_eta(start, now, total, current):
    """Estimate remaining download time as 'MM:SS' ('--:--' if unknown)."""
    if total is None:
        return '--:--'
    dif = now - start
    if current == 0 or dif < 0.001:  # One millisecond
        return '--:--'
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    # The two-digit field cannot represent more than 99 minutes.
    if eta_mins > 99:
        return '--:--'
    return '%02d:%02d' % (eta_mins, eta_secs)
@staticmethod
def calc_speed(start, now, bytes):
    """Return the average download speed as a fixed-width string."""
    dif = now - start
    if bytes == 0 or dif < 0.001:  # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
@staticmethod
def best_block_size(elapsed_time, bytes):
    """Choose the next read size from how long the last block took.

    Aims at roughly one read per unit of time: grows the block when the
    measured rate is high, shrinks it when low, clamped to [1, 4 MB] and
    never more than doubling/halving per step.
    """
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
    if elapsed_time < 0.001:
        return long(new_max)
    rate = bytes / elapsed_time
    if rate > new_max:
        return long(new_max)
    if rate < new_min:
        return long(new_min)
    return long(rate)
@staticmethod
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer.

    Accepts e.g. '500', '1.5k', '2M'; returns None on malformed input.
    """
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # An empty suffix maps to index 0 of 'bkmgtpezy' via... no: '' .lower()
    # is '' which .index() would reject -- the regex guarantees the suffix
    # group is one of the listed letters or empty; '' lower-cases to ''.
    # 'b' (exponent 0) is the implicit unit when the suffix is empty.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower() or 'b')
    return long(round(number * multiplier))
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    self._ies.append(ie)
    # Mutual registration: the IE needs the downloader to report results.
    ie.set_downloader(self)
def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    self._pps.append(pp)
    # Mutual registration, mirroring add_info_extractor.
    pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode explicitly: stderr may not accept unicode on all locales.
    print >>sys.stderr, message.encode(preferredencoding())
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Checks if the output template is fixed.

    A template without %(...)s placeholders always expands to the same
    filename, so downloading several URLs would overwrite one file.
    """
    return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        # Fail fast: surface the problem as a DownloadError to the caller.
        raise DownloadError(message)
    # Errors are being ignored: remember the failure for the exit status.
    self._download_retcode = 1
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        return
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        return
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep exactly long enough that byte_counter bytes over the total
        # elapsed time equals the configured rate.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def temp_name(self, filename):
    """Returns a temporary filename for the given filename."""
    if self.params.get('nopart', False) or filename == u'-' or \
            (os.path.exists(filename) and not os.path.isfile(filename)):
        # stdout, --no-part mode, or a non-regular file (pipe, device)
        # cannot go through a '.part' intermediate; download in place.
        return filename
    return filename + u'.part'
def undo_temp_name(self, filename):
    """Strip the '.part' suffix added by temp_name, if present."""
    if filename.endswith(u'.part'):
        return filename[:-len(u'.part')]
    return filename
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
643 def report_destination(self, filename):
644 """Report destination filename."""
645 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
660 def report_retry(self, count, retries):
661 """Report retry in case of HTTP error 5xx"""
662 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
682 def increment_downloads(self):
683 """Increment the ordinal that assigns a number to each file."""
684 self._num_downloads += 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
714 # Do nothing else if in simulate mode
715 if self.params.get('simulate', False):
721 matchtitle=self.params.get('matchtitle',False)
722 rejecttitle=self.params.get('rejecttitle',False)
723 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
724 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
725 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
727 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
728 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
731 if self.params.get('nooverwrites', False) and os.path.exists(filename):
732 self.to_stderr(u'WARNING: file exists and will be skipped')
736 dn = os.path.dirname(filename)
737 if dn != '' and not os.path.exists(dn):
739 except (OSError, IOError), err:
740 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
743 if self.params.get('writedescription', False):
745 descfn = filename + '.description'
746 self.report_writedescription(descfn)
747 descfile = open(descfn, 'wb')
749 descfile.write(info_dict['description'].encode('utf-8'))
752 except (OSError, IOError):
753 self.trouble(u'ERROR: Cannot write description file ' + descfn)
756 if self.params.get('writeinfojson', False):
757 infofn = filename + '.info.json'
758 self.report_writeinfojson(infofn)
761 except (NameError,AttributeError):
762 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
765 infof = open(infofn, 'wb')
767 json.dump(info_dict, infof)
770 except (OSError, IOError):
771 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
774 if not self.params.get('skip_download', False):
776 success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
777 info_dict.update(add_data)
778 except (OSError, IOError), err:
779 raise UnavailableVideoError
780 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
781 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
783 except (ContentTooShortError, ), err:
784 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
789 self.post_process(filename, info_dict)
790 except (PostProcessingError), err:
791 self.trouble(u'ERROR: postprocessing: %s' % str(err))
794 def download(self, url_list):
795 """Download a given list of URLs."""
796 if len(url_list) > 1 and self.fixed_template():
797 raise SameFileError(self.params['outtmpl'])
800 suitable_found = False
802 # Go to next InfoExtractor if not suitable
803 if not ie.suitable(url):
806 # Suitable InfoExtractor found
807 suitable_found = True
809 # Extract information from URL and process it
812 # Suitable InfoExtractor had been found; go to next URL
815 if not suitable_found:
816 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
818 return self._download_retcode
820 def post_process(self, filename, ie_info):
821 """Run the postprocessing chain on the given file."""
823 info['filepath'] = filename
829 def _download_with_rtmpdump(self, filename, url, player_url):
830 self.report_destination(filename)
831 tmpfilename = self.temp_name(filename)
833 # Check for rtmpdump first
835 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
836 except (OSError, IOError):
837 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
840 # Download using rtmpdump. rtmpdump returns exit code 2 when
841 # the connection was interrumpted and resuming appears to be
842 # possible. This is part of rtmpdump's normal usage, AFAIK.
843 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
844 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
845 while retval == 2 or retval == 1:
846 prevsize = os.path.getsize(tmpfilename)
847 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
848 time.sleep(5.0) # This seems to be needed
849 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
850 cursize = os.path.getsize(tmpfilename)
851 if prevsize == cursize and retval == 1:
853 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
854 if prevsize == cursize and retval == 2 and cursize > 1024:
855 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
859 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
860 self.try_rename(tmpfilename, filename)
863 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
866 def _do_download(self, filename, url, player_url):
867 # Check file already present
868 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
869 self.report_file_already_downloaded(filename)
872 # Attempt to download using rtmpdump
873 if url.startswith('rtmp'):
874 return self._download_with_rtmpdump(filename, url, player_url)
876 tmpfilename = self.temp_name(filename)
880 # Do not include the Accept-Encoding header
881 headers = {'Youtubedl-no-compression': 'True'}
882 basic_request = urllib2.Request(url, None, headers)
883 request = urllib2.Request(url, None, headers)
885 # Establish possible resume length
886 if os.path.isfile(tmpfilename):
887 resume_len = os.path.getsize(tmpfilename)
891 # Request parameters in case of being able to resume
892 if self.params.get('continuedl', False) and resume_len != 0:
893 self.report_resuming_byte(resume_len)
894 request.add_header('Range', 'bytes=%d-' % resume_len)
898 retries = self.params.get('retries', 0)
899 while count <= retries:
900 # Establish connection
902 data = urllib2.urlopen(request)
904 except (urllib2.HTTPError, ), err:
905 if (err.code < 500 or err.code >= 600) and err.code != 416:
906 # Unexpected HTTP error
908 elif err.code == 416:
909 # Unable to resume (requested range not satisfiable)
911 # Open the connection again without the range header
912 data = urllib2.urlopen(basic_request)
913 content_length = data.info()['Content-Length']
914 except (urllib2.HTTPError, ), err:
915 if err.code < 500 or err.code >= 600:
918 # Examine the reported length
919 if (content_length is not None and
920 (resume_len - 100 < long(content_length) < resume_len + 100)):
921 # The file had already been fully downloaded.
922 # Explanation to the above condition: in issue #175 it was revealed that
923 # YouTube sometimes adds or removes a few bytes from the end of the file,
924 # changing the file size slightly and causing problems for some users. So
925 # I decided to implement a suggested change and consider the file
926 # completely downloaded if the file size differs less than 100 bytes from
927 # the one in the hard drive.
928 self.report_file_already_downloaded(filename)
929 self.try_rename(tmpfilename, filename)
932 # The length does not match, we start the download over
933 self.report_unable_to_resume()
939 self.report_retry(count, retries)
942 self.trouble(u'ERROR: giving up after %s retries' % retries)
945 data_len = data.info().get('Content-length', None)
946 if data_len is not None:
947 data_len = long(data_len) + resume_len
948 data_len_str = self.format_bytes(data_len)
949 byte_counter = 0 + resume_len
955 data_block = data.read(block_size)
957 if len(data_block) == 0:
959 byte_counter += len(data_block)
961 # Open file just in time
964 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
965 assert stream is not None
966 filename = self.undo_temp_name(tmpfilename)
967 self.report_destination(filename)
968 except (OSError, IOError), err:
969 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
972 stream.write(data_block)
973 except (IOError, OSError), err:
974 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
976 block_size = self.best_block_size(after - before, len(data_block))
979 percent_str = self.calc_percent(byte_counter, data_len)
980 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
981 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
982 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
985 self.slow_down(start, byte_counter - resume_len)
988 self.trouble(u'\nERROR: Did not get any data blocks')
992 if data_len is not None and byte_counter != data_len:
993 raise ContentTooShortError(byte_counter, long(data_len))
994 self.try_rename(tmpfilename, filename)
996 # Update file modification time
998 if self.params.get('updatetime', True):
999 filetime = self.try_utime(filename, data.info().get('last-modified', None))
1001 return True, {'filetime': filetime}
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    _ready = False        # True once _real_initialize() has run
    _downloader = None    # FileDownloader in charge of this IE

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Lazy, one-shot initialization shared by every extract() call.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
# NOTE(review): this chunk is a mangled/elided paste — the original file's
# line numbers are embedded at the start of every line and many intermediate
# lines (try:, "if mobj is None:" guards, returns, dict entries) are missing,
# as the gaps in the embedded numbering show.  Code is kept byte-identical
# below; only comments are added.
1074 class YoutubeIE(InfoExtractor):
1075 """Information extractor for youtube.com."""
# Group 1 captures any URL prefix; group 2 is the video id.  The trailing
# (?(1).+)? is a conditional pattern: trailing text is only allowed when a
# prefix (group 1) actually matched.
1077 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Forces the site into English/US so the scraped strings are predictable.
1078 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1079 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1080 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in ~/.netrc.
1081 _NETRC_MACHINE = 'youtube'
1082 # Listed in order of quality
1083 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (most entries elided in this chunk).
1084 _video_extensions = {
1090 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1094 IE_NAME = u'youtube'
# --- progress-reporting helpers: all write through self._downloader ---
1096 def report_lang(self):
1097 """Report attempt to set language."""
1098 self._downloader.to_screen(u'[youtube] Setting language')
1100 def report_login(self):
1101 """Report attempt to log in."""
1102 self._downloader.to_screen(u'[youtube] Logging in')
1104 def report_age_confirmation(self):
1105 """Report attempt to confirm age."""
1106 self._downloader.to_screen(u'[youtube] Confirming age')
1108 def report_video_webpage_download(self, video_id):
1109 """Report attempt to download video webpage."""
1110 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1112 def report_video_info_webpage_download(self, video_id):
1113 """Report attempt to download video info webpage."""
1114 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1116 def report_information_extraction(self, video_id):
1117 """Report attempt to extract video information."""
1118 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1120 def report_unavailable_format(self, video_id, format):
1121 """Report extracted video URL."""
1122 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1124 def report_rtmp_download(self):
1125 """Indicate the download will use the RTMP protocol."""
1126 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Sets the site language, then logs in (explicit username/password or
# ~/.netrc), then confirms age.  Network failures are reported through
# self._downloader (warnings for language/login, error for age) rather
# than raised.
1128 def _real_initialize(self):
1129 if self._downloader is None:
1134 downloader_params = self._downloader.params
1136 # Attempt to use provided username and password or .netrc data
1137 if downloader_params.get('username', None) is not None:
1138 username = downloader_params['username']
1139 password = downloader_params['password']
1140 elif downloader_params.get('usenetrc', False):
1142 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1143 if info is not None:
# (assignment of username/password from the netrc tuple is elided here)
1147 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1148 except (IOError, netrc.NetrcParseError), err:
1149 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Language setting is best-effort: a failure only warns.
1153 request = urllib2.Request(self._LANG_URL)
1156 urllib2.urlopen(request).read()
1157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1158 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1161 # No authentication to be performed
1162 if username is None:
# Log in by POSTing the login form; if the response still contains a
# loginForm, the credentials were rejected.
1167 'current_form': 'loginForm',
1169 'action_login': 'Log In',
1170 'username': username,
1171 'password': password,
1173 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1176 login_results = urllib2.urlopen(request).read()
1177 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1178 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age (needed for age-restricted videos); failure is reported
# via trouble() rather than a warning.
1187 'action_confirm': 'Confirm',
1189 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1191 self.report_age_confirmation()
1192 age_results = urllib2.urlopen(request).read()
1193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1197 def _real_extract(self, url):
1198 # Extract video id from URL
1199 mobj = re.match(self._VALID_URL, url)
1201 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1203 video_id = mobj.group(2)
1206 self.report_video_webpage_download(video_id)
1207 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1209 video_webpage = urllib2.urlopen(request).read()
1210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1214 # Attempt to extract SWF player URL
1215 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216 if mobj is not None:
# Unescape the JSON-style backslash escapes in the SWF URL.
1217 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Retry get_video_info under several 'el' contexts until one yields a token.
1222 self.report_video_info_webpage_download(video_id)
1223 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225 % (video_id, el_type))
1226 request = urllib2.Request(video_info_url)
1228 video_info_webpage = urllib2.urlopen(request).read()
1229 video_info = parse_qs(video_info_webpage)
1230 if 'token' in video_info:
1232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1235 if 'token' not in video_info:
1236 if 'reason' in video_info:
1237 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1239 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1242 # Start extracting information
1243 self.report_information_extraction(video_id)
1246 if 'author' not in video_info:
1247 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1249 video_uploader = urllib.unquote_plus(video_info['author'][0])
1252 if 'title' not in video_info:
1253 self._downloader.trouble(u'ERROR: unable to extract video title')
1255 video_title = urllib.unquote_plus(video_info['title'][0])
1256 video_title = video_title.decode('utf-8')
1257 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of characters outside
# simple_title_chars into a single '_' and trim the edges.
1260 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261 simple_title = simple_title.strip(ur'_')
1264 if 'thumbnail_url' not in video_info:
1265 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266 video_thumbnail = ''
1267 else: # don't panic if we can't find it
1268 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and tried against several
# textual date formats, normalized to YYYYMMDD.
1272 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273 if mobj is not None:
1274 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276 for expression in format_expressions:
1278 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1286 video_description = u'No description available.'
1287 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289 if mobj is not None:
1290 video_description = mobj.group(1).decode('utf-8')
# lxml-based description extraction (the guard selecting this branch is
# elided in this chunk — presumably gated on lxml availability; TODO confirm
# against the full file).
1292 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295 # TODO use another parser
1298 video_token = urllib.unquote_plus(video_info['token'][0])
1300 # Decide which formats to download
1301 req_format = self._downloader.params.get('format', None)
# RTMP streams carry their URL in 'conn' and bypass format selection.
1303 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304 self.report_rtmp_download()
1305 video_url_list = [(None, video_info['conn'][0])]
1306 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1307 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308 url_data = [parse_qs(uds) for uds in url_data_strs]
1309 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# format_limit caps the quality: _available_formats is best-first, so
# slicing from the limit's index keeps only equal-or-worse formats.
1312 format_limit = self._downloader.params.get('format_limit', None)
1313 if format_limit is not None and format_limit in self._available_formats:
1314 format_list = self._available_formats[self._available_formats.index(format_limit):]
1316 format_list = self._available_formats
1317 existing_formats = [x for x in format_list if x in url_map]
1318 if len(existing_formats) == 0:
1319 self._downloader.trouble(u'ERROR: no known formats available for video')
1321 if req_format is None:
1322 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323 elif req_format == 'worst':
1324 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1325 elif req_format == '-1':
1326 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1329 if req_format not in url_map:
1330 self._downloader.trouble(u'ERROR: requested format not available')
1332 video_url_list = [(req_format, url_map[req_format])] # Specific format
1334 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1337 for format_param, video_real_url in video_url_list:
1338 # At this point we have a new video
1339 self._downloader.increment_downloads()
# Unknown itags fall back to the 'flv' extension.
1342 video_extension = self._video_extensions.get(format_param, 'flv')
1345 # Process video information
1346 self._downloader.process_info({
1347 'id': video_id.decode('utf-8'),
1348 'url': video_real_url.decode('utf-8'),
1349 'uploader': video_uploader.decode('utf-8'),
1350 'upload_date': upload_date,
1351 'title': video_title,
1352 'stitle': simple_title,
1353 'ext': video_extension.decode('utf-8'),
# Old "cond and a or b" idiom: yields u'NA' when format_param is None.
1354 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1355 'thumbnail': video_thumbnail.decode('utf-8'),
1356 'description': video_description,
1357 'player_url': player_url,
1359 except UnavailableVideoError, err:
1360 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided paste — original line numbers embedded in every line,
# and several lines (try:, "if mobj is None:" guards, returns, dict entries)
# are missing.  Code kept byte-identical; comments only added.
1363 class MetacafeIE(InfoExtractor):
1364 """Information Extractor for metacafe.com."""
1366 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1367 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1368 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1370 IE_NAME = u'metacafe'
# Keeps a YoutubeIE reference because Metacafe mirrors YouTube videos
# (ids of the form "yt-<id>") which are delegated to that extractor.
1372 def __init__(self, youtube_ie, downloader=None):
1373 InfoExtractor.__init__(self, downloader)
1374 self._youtube_ie = youtube_ie
1376 def report_disclaimer(self):
1377 """Report disclaimer retrieval."""
1378 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1380 def report_age_confirmation(self):
1381 """Report attempt to confirm age."""
1382 self._downloader.to_screen(u'[metacafe] Confirming age')
1384 def report_download_webpage(self, video_id):
1385 """Report webpage download."""
1386 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1388 def report_extraction(self, video_id):
1389 """Report information extraction."""
1390 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the family-filter disclaimer page and then POSTs the over-18
# confirmation so subsequent video pages are served unfiltered.
1392 def _real_initialize(self):
1393 # Retrieve disclaimer
1394 request = urllib2.Request(self._DISCLAIMER)
1396 self.report_disclaimer()
1397 disclaimer = urllib2.urlopen(request).read()
1398 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1399 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1405 'submit': "Continue - I'm over 18",
1407 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1409 self.report_age_confirmation()
1410 disclaimer = urllib2.urlopen(request).read()
1411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1415 def _real_extract(self, url):
1416 # Extract id and simplified title from URL
1417 mobj = re.match(self._VALID_URL, url)
1419 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1422 video_id = mobj.group(1)
1424 # Check if video comes from YouTube
1425 mobj2 = re.match(r'^yt-(.*)$', video_id)
1426 if mobj2 is not None:
# Delegate mirrored YouTube videos to the YouTube extractor.
1427 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1430 # At this point we have a new video
1431 self._downloader.increment_downloads()
1433 simple_title = mobj.group(2).decode('utf-8')
1435 # Retrieve video webpage to extract further information
1436 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1438 self.report_download_webpage(video_id)
1439 webpage = urllib2.urlopen(request).read()
1440 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1441 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1444 # Extract URL, uploader and title from webpage
1445 self.report_extraction(video_id)
1446 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1447 if mobj is not None:
1448 mediaURL = urllib.unquote(mobj.group(1))
# NOTE(review): assumes a 3-letter extension (e.g. "flv") — verify.
1449 video_extension = mediaURL[-3:]
1451 # Extract gdaKey if available
1452 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1454 video_url = mediaURL
1456 gdaKey = mobj.group(1)
1457 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull mediaURL and key out of the flashvars mediaData blob.
1459 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1461 self._downloader.trouble(u'ERROR: unable to extract media URL')
1463 vardict = parse_qs(mobj.group(1))
1464 if 'mediaData' not in vardict:
1465 self._downloader.trouble(u'ERROR: unable to extract media URL')
1467 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1469 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Undo the JSON-escaped slashes in the URL.
1471 mediaURL = mobj.group(1).replace('\\/', '/')
1472 video_extension = mediaURL[-3:]
1473 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1475 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1477 self._downloader.trouble(u'ERROR: unable to extract title')
1479 video_title = mobj.group(1).decode('utf-8')
1480 video_title = sanitize_title(video_title)
1482 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1484 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1486 video_uploader = mobj.group(1)
1489 # Process video information
1490 self._downloader.process_info({
1491 'id': video_id.decode('utf-8'),
1492 'url': video_url.decode('utf-8'),
1493 'uploader': video_uploader.decode('utf-8'),
1494 'upload_date': u'NA',
1495 'title': video_title,
1496 'stitle': simple_title,
1497 'ext': video_extension.decode('utf-8'),
1501 except UnavailableVideoError:
1502 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided paste — embedded original line numbers; some guard/try
# lines are missing.  Code kept byte-identical; comments only added.
1505 class DailymotionIE(InfoExtractor):
1506 """Information Extractor for Dailymotion"""
1508 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1509 IE_NAME = u'dailymotion'
1511 def __init__(self, downloader=None):
1512 InfoExtractor.__init__(self, downloader)
1514 def report_download_webpage(self, video_id):
1515 """Report webpage download."""
1516 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1518 def report_extraction(self, video_id):
1519 """Report information extraction."""
1520 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1522 def _real_initialize(self):
1525 def _real_extract(self, url):
1526 # Extract id and simplified title from URL
1527 mobj = re.match(self._VALID_URL, url)
1529 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1532 # At this point we have a new video
1533 self._downloader.increment_downloads()
1534 video_id = mobj.group(1)
1536 simple_title = mobj.group(2).decode('utf-8')
1537 video_extension = 'flv'
1539 # Retrieve video webpage to extract further information
1540 request = urllib2.Request(url)
# Disable the family filter so age-gated videos are still served.
1541 request.add_header('Cookie', 'family_filter=off')
1543 self.report_download_webpage(video_id)
1544 webpage = urllib2.urlopen(request).read()
1545 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1546 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1549 # Extract URL, uploader and title from webpage
1550 self.report_extraction(video_id)
1551 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1553 self._downloader.trouble(u'ERROR: unable to extract media URL')
1555 sequence = urllib.unquote(mobj.group(1))
# "sdURL" is the standard-definition stream URL inside the sequence data.
1556 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1558 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Strip the JSON escaping backslashes from the URL.
1560 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1562 # if needed add http://www.dailymotion.com/ if relative URL
1564 video_url = mediaURL
1566 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1568 self._downloader.trouble(u'ERROR: unable to extract title')
1570 video_title = mobj.group(1).decode('utf-8')
1571 video_title = sanitize_title(video_title)
1573 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1575 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1577 video_uploader = mobj.group(1)
1580 # Process video information
1581 self._downloader.process_info({
1582 'id': video_id.decode('utf-8'),
1583 'url': video_url.decode('utf-8'),
1584 'uploader': video_uploader.decode('utf-8'),
1585 'upload_date': u'NA',
1586 'title': video_title,
1587 'stitle': simple_title,
1588 'ext': video_extension.decode('utf-8'),
1592 except UnavailableVideoError:
1593 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided paste — embedded original line numbers; some guard/try
# lines are missing.  Code kept byte-identical; comments only added.
1596 class GoogleIE(InfoExtractor):
1597 """Information extractor for video.google.com."""
1599 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1600 IE_NAME = u'video.google'
1602 def __init__(self, downloader=None):
1603 InfoExtractor.__init__(self, downloader)
1605 def report_download_webpage(self, video_id):
1606 """Report webpage download."""
1607 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1609 def report_extraction(self, video_id):
1610 """Report information extraction."""
1611 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1613 def _real_initialize(self):
1616 def _real_extract(self, url):
1617 # Extract id from URL
1618 mobj = re.match(self._VALID_URL, url)
1620 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1623 # At this point we have a new video
1624 self._downloader.increment_downloads()
1625 video_id = mobj.group(1)
1627 video_extension = 'mp4'
1629 # Retrieve video webpage to extract further information
1630 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1632 self.report_download_webpage(video_id)
1633 webpage = urllib2.urlopen(request).read()
1634 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1638 # Extract URL, uploader, and title from webpage
1639 self.report_extraction(video_id)
# Prefer the mp4 download_url; otherwise fall back to the flv videoUrl
# flashvar (extension switched accordingly).
1640 mobj = re.search(r"download_url:'([^']+)'", webpage)
1642 video_extension = 'flv'
1643 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1645 self._downloader.trouble(u'ERROR: unable to extract media URL')
1647 mediaURL = urllib.unquote(mobj.group(1))
# Decode the literal "\xNN" escape sequences Google embeds in flashvars.
1648 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1649 mediaURL = mediaURL.replace('\\x26', '\x26')
1651 video_url = mediaURL
1653 mobj = re.search(r'<title>(.*)</title>', webpage)
1655 self._downloader.trouble(u'ERROR: unable to extract title')
1657 video_title = mobj.group(1).decode('utf-8')
1658 video_title = sanitize_title(video_title)
1659 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1661 # Extract video description
1662 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1664 self._downloader.trouble(u'ERROR: unable to extract video description')
1666 video_description = mobj.group(1).decode('utf-8')
1667 if not video_description:
1668 video_description = 'No description available.'
1670 # Extract video thumbnail
1671 if self._downloader.params.get('forcethumbnail', False):
# The docid is numeric; abs(int(...)) normalizes it for the search query.
1672 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1674 webpage = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1678 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1680 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1682 video_thumbnail = mobj.group(1)
1683 else: # we need something to pass to process_info
1684 video_thumbnail = ''
1687 # Process video information
1688 self._downloader.process_info({
1689 'id': video_id.decode('utf-8'),
1690 'url': video_url.decode('utf-8'),
# (the 'uploader' entry is elided in this chunk — TODO confirm upstream)
1692 'upload_date': u'NA',
1693 'title': video_title,
1694 'stitle': simple_title,
1695 'ext': video_extension.decode('utf-8'),
1699 except UnavailableVideoError:
1700 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided paste — embedded original line numbers; some guard/try
# lines are missing.  Code kept byte-identical; comments only added.
1703 class PhotobucketIE(InfoExtractor):
1704 """Information extractor for photobucket.com."""
1706 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1707 IE_NAME = u'photobucket'
1709 def __init__(self, downloader=None):
1710 InfoExtractor.__init__(self, downloader)
1712 def report_download_webpage(self, video_id):
1713 """Report webpage download."""
1714 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1716 def report_extraction(self, video_id):
1717 """Report information extraction."""
1718 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1720 def _real_initialize(self):
1723 def _real_extract(self, url):
1724 # Extract id from URL
1725 mobj = re.match(self._VALID_URL, url)
1727 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1730 # At this point we have a new video
1731 self._downloader.increment_downloads()
1732 video_id = mobj.group(1)
# _VALID_URL only matches *.flv resources, hence the fixed extension.
1734 video_extension = 'flv'
1736 # Retrieve video webpage to extract further information
1737 request = urllib2.Request(url)
1739 self.report_download_webpage(video_id)
1740 webpage = urllib2.urlopen(request).read()
1741 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1742 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1745 # Extract URL, uploader, and title from webpage
1746 self.report_extraction(video_id)
1747 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1749 self._downloader.trouble(u'ERROR: unable to extract media URL')
1751 mediaURL = urllib.unquote(mobj.group(1))
1753 video_url = mediaURL
# Title and uploader come from the same <title> match (groups 1 and 2).
1755 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1757 self._downloader.trouble(u'ERROR: unable to extract title')
1759 video_title = mobj.group(1).decode('utf-8')
1760 video_title = sanitize_title(video_title)
1761 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1763 video_uploader = mobj.group(2).decode('utf-8')
1766 # Process video information
1767 self._downloader.process_info({
1768 'id': video_id.decode('utf-8'),
1769 'url': video_url.decode('utf-8'),
1770 'uploader': video_uploader,
1771 'upload_date': u'NA',
1772 'title': video_title,
1773 'stitle': simple_title,
1774 'ext': video_extension.decode('utf-8'),
1778 except UnavailableVideoError:
1779 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided paste — embedded original line numbers; some guard/try
# lines are missing.  Code kept byte-identical; comments only added.
1782 class YahooIE(InfoExtractor):
1783 """Information extractor for video.yahoo.com."""
1785 # _VALID_URL matches all Yahoo! Video URLs
1786 # _VPAGE_URL matches only the extractable '/watch/' URLs
1787 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1788 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1789 IE_NAME = u'video.yahoo'
1791 def __init__(self, downloader=None):
1792 InfoExtractor.__init__(self, downloader)
1794 def report_download_webpage(self, video_id):
1795 """Report webpage download."""
1796 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1798 def report_extraction(self, video_id):
1799 """Report information extraction."""
1800 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1802 def _real_initialize(self):
# new_video=False on the recursive call below prevents double-counting the
# download after a non-/watch/ URL has been rewritten.
1805 def _real_extract(self, url, new_video=True):
1806 # Extract ID from URL
1807 mobj = re.match(self._VALID_URL, url)
1809 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1812 # At this point we have a new video
1813 self._downloader.increment_downloads()
1814 video_id = mobj.group(2)
1815 video_extension = 'flv'
1817 # Rewrite valid but non-extractable URLs as
1818 # extractable English language /watch/ URLs
1819 if re.match(self._VPAGE_URL, url) is None:
1820 request = urllib2.Request(url)
1822 webpage = urllib2.urlopen(request).read()
1823 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1824 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1827 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1829 self._downloader.trouble(u'ERROR: Unable to extract id field')
1831 yahoo_id = mobj.group(1)
1833 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1835 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1837 yahoo_vid = mobj.group(1)
1839 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1840 return self._real_extract(url, new_video=False)
1842 # Retrieve video webpage to extract further information
1843 request = urllib2.Request(url)
1845 self.report_download_webpage(video_id)
1846 webpage = urllib2.urlopen(request).read()
1847 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1851 # Extract uploader and title from webpage
1852 self.report_extraction(video_id)
1853 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract video title')
1857 video_title = mobj.group(1).decode('utf-8')
1858 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1860 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1862 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above is the (people|profile)
# alternation, not the uploader name — the name is group(2).  Looks like a
# wrong-group bug; verify against the full file before changing.
1864 video_uploader = mobj.group(1).decode('utf-8')
1866 # Extract video thumbnail
1867 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1869 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1871 video_thumbnail = mobj.group(1).decode('utf-8')
1873 # Extract video description
1874 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1876 self._downloader.trouble(u'ERROR: unable to extract video description')
1878 video_description = mobj.group(1).decode('utf-8')
1879 if not video_description:
1880 video_description = 'No description available.'
1882 # Extract video height and width
1883 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1885 self._downloader.trouble(u'ERROR: unable to extract video height')
1887 yv_video_height = mobj.group(1)
1889 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1891 self._downloader.trouble(u'ERROR: unable to extract video width')
1893 yv_video_width = mobj.group(1)
1895 # Retrieve video playlist to extract media URL
1896 # I'm not completely sure what all these options are, but we
1897 # seem to need most of them, otherwise the server sends a 401.
1898 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1899 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1900 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1901 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1902 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1904 self.report_download_webpage(video_id)
1905 webpage = urllib2.urlopen(request).read()
1906 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1910 # Extract media URL from playlist XML
1911 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1913 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1915 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Decode HTML entities (&amp; etc.) remaining in the playlist URL.
1916 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1919 # Process video information
1920 self._downloader.process_info({
1921 'id': video_id.decode('utf-8'),
# (the 'url' entry is elided in this chunk)
1923 'uploader': video_uploader,
1924 'upload_date': u'NA',
1925 'title': video_title,
1926 'stitle': simple_title,
1927 'ext': video_extension.decode('utf-8'),
1928 'thumbnail': video_thumbnail.decode('utf-8'),
1929 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key — in a dict literal this later
# entry silently overwrites the decoded value above.
1930 'thumbnail': video_thumbnail,
1933 except UnavailableVideoError:
1934 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided paste — embedded original line numbers; some guard/try
# lines (and apparently the IE_NAME assignment) are missing.  Code kept
# byte-identical; comments only added.
1937 class VimeoIE(InfoExtractor):
1938 """Information extractor for vimeo.com."""
1940 # _VALID_URL matches Vimeo URLs
1941 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1944 def __init__(self, downloader=None):
1945 InfoExtractor.__init__(self, downloader)
1947 def report_download_webpage(self, video_id):
1948 """Report webpage download."""
1949 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1951 def report_extraction(self, video_id):
1952 """Report information extraction."""
1953 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1955 def _real_initialize(self):
1958 def _real_extract(self, url, new_video=True):
1959 # Extract ID from URL
1960 mobj = re.match(self._VALID_URL, url)
1962 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1965 # At this point we have a new video
1966 self._downloader.increment_downloads()
1967 video_id = mobj.group(1)
1969 # Retrieve video webpage to extract further information
# moogaloop/load returns the clip's XML config (caption, signature,
# thumbnail, ...); std_headers supplies the browser-like User-Agent.
1970 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1972 self.report_download_webpage(video_id)
1973 webpage = urllib2.urlopen(request).read()
1974 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1975 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1978 # Now we begin extracting as much information as we can from what we
1979 # retrieved. First we extract the information common to all extractors,
1980 # and latter we extract those that are Vimeo specific.
1981 self.report_extraction(video_id)
1984 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1986 self._downloader.trouble(u'ERROR: unable to extract video title')
1988 video_title = mobj.group(1).decode('utf-8')
1989 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1992 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1994 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1996 video_uploader = mobj.group(1).decode('utf-8')
1998 # Extract video thumbnail
1999 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2001 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2003 video_thumbnail = mobj.group(1).decode('utf-8')
2005 # # Extract video description
2006 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2008 # self._downloader.trouble(u'ERROR: unable to extract video description')
2010 # video_description = mobj.group(1).decode('utf-8')
2011 # if not video_description: video_description = 'No description available.'
# NOTE(review): placeholder description left in ('Foo.') — the real
# extraction above is commented out.
2012 video_description = 'Foo.'
2014 # Vimeo specific: extract request signature
2015 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2017 self._downloader.trouble(u'ERROR: unable to extract request signature')
2019 sig = mobj.group(1).decode('utf-8')
2021 # Vimeo specific: Extract request signature expiration
2022 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2024 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2026 sig_exp = mobj.group(1).decode('utf-8')
# The play URL requires the signature and its expiry timestamp.
2028 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2031 # Process video information
2032 self._downloader.process_info({
2033 'id': video_id.decode('utf-8'),
# (the 'url'/'ext' entries are elided in this chunk)
2035 'uploader': video_uploader,
2036 'upload_date': u'NA',
2037 'title': video_title,
2038 'stitle': simple_title,
2040 'thumbnail': video_thumbnail.decode('utf-8'),
2041 'description': video_description,
# NOTE(review): duplicate 'thumbnail'/'description' keys — these later
# entries silently win over the two above in the dict literal.
2042 'thumbnail': video_thumbnail,
2043 'description': video_description,
2046 except UnavailableVideoError:
2047 self._downloader.trouble(u'ERROR: unable to download video')
2050 class GenericIE(InfoExtractor):
2051 """Generic last-resort information extractor."""
2054 IE_NAME = u'generic'
2056 def __init__(self, downloader=None):
2057 InfoExtractor.__init__(self, downloader)
2059 def report_download_webpage(self, video_id):
2060 """Report webpage download."""
2061 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2062 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2064 def report_extraction(self, video_id):
2065 """Report information extraction."""
2066 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2068 def _real_initialize(self):
2071 def _real_extract(self, url):
2072 # At this point we have a new video
2073 self._downloader.increment_downloads()
2075 video_id = url.split('/')[-1]
2076 request = urllib2.Request(url)
2078 self.report_download_webpage(video_id)
2079 webpage = urllib2.urlopen(request).read()
2080 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2081 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2083 except ValueError, err:
2084 # since this is the last-resort InfoExtractor, if
2085 # this error is thrown, it'll be thrown here
2086 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2089 self.report_extraction(video_id)
2090 # Start with something easy: JW Player in SWFObject
2091 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2093 # Broaden the search a little bit
2094 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2096 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2099 # It's possible that one of the regexes
2100 # matched, but returned an empty group:
2101 if mobj.group(1) is None:
2102 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2105 video_url = urllib.unquote(mobj.group(1))
2106 video_id = os.path.basename(video_url)
2108 # here's a fun little line of code for you:
2109 video_extension = os.path.splitext(video_id)[1][1:]
2110 video_id = os.path.splitext(video_id)[0]
2112 # it's tempting to parse this further, but you would
2113 # have to take into account all the variations like
2114 # Video Title - Site Name
2115 # Site Name | Video Title
2116 # Video Title - Tagline | Site Name
2117 # and so on and so forth; it's just not practical
2118 mobj = re.search(r'<title>(.*)</title>', webpage)
2120 self._downloader.trouble(u'ERROR: unable to extract title')
2122 video_title = mobj.group(1).decode('utf-8')
2123 video_title = sanitize_title(video_title)
2124 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2126 # video uploader is domain name
2127 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2129 self._downloader.trouble(u'ERROR: unable to extract title')
2131 video_uploader = mobj.group(1).decode('utf-8')
2134 # Process video information
2135 self._downloader.process_info({
2136 'id': video_id.decode('utf-8'),
2137 'url': video_url.decode('utf-8'),
2138 'uploader': video_uploader,
2139 'upload_date': u'NA',
2140 'title': video_title,
2141 'stitle': simple_title,
2142 'ext': video_extension.decode('utf-8'),
2146 except UnavailableVideoError, err:
2147 self._downloader.trouble(u'\nERROR: unable to download video')
2150 class YoutubeSearchIE(InfoExtractor):
# Handles "ytsearch:", "ytsearchN:" and "ytsearchall:" pseudo-URLs.
# Scrapes YouTube's HTML result pages for video ids and delegates each
# actual download to the wrapped YoutubeIE instance.
2151 """Information Extractor for YouTube search queries."""
2152 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2153 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
# Regex locating watch links in a result page (see _download_n_results).
2154 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
# Presence of a "Next" link means more result pages exist.
2155 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2157 _max_youtube_results = 1000
2158 IE_NAME = u'youtube:search'
2160 def __init__(self, youtube_ie, downloader=None):
2161 InfoExtractor.__init__(self, downloader)
# Concrete YouTube extractor that performs the per-video extraction.
2162 self._youtube_ie = youtube_ie
2164 def report_download_page(self, query, pagenum):
2165 """Report attempt to download playlist page with given number."""
2166 query = query.decode(preferredencoding())
2167 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2169 def _real_initialize(self):
2170 self._youtube_ie.initialize()
2172 def _real_extract(self, query):
2173 mobj = re.match(self._VALID_URL, query)
2175 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text proper.
2178 prefix, query = query.split(':')
2180 query = query.encode('utf-8')
# Empty prefix ("ytsearch:") downloads a single result.
2182 self._download_n_results(query, 1)
2184 elif prefix == 'all':
2185 self._download_n_results(query, self._max_youtube_results)
2191 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2193 elif n > self._max_youtube_results:
# Clamp oversized requests to the service maximum and warn.
2194 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2195 n = self._max_youtube_results
2196 self._download_n_results(query, n)
2198 except ValueError: # parsing prefix as integer fails
2199 self._download_n_results(query, 1)
2202 def _download_n_results(self, query, n):
2203 """Downloads a specified number of results for a query"""
# De-duplicate ids across result pages while preserving first-seen order.
2206 already_seen = set()
2210 self.report_download_page(query, pagenum)
2211 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2212 request = urllib2.Request(result_url)
2214 page = urllib2.urlopen(request).read()
2215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2216 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2219 # Extract video identifiers
2220 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href and pull the value after "v=", dropping the
# trailing quote character.
2221 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2222 if video_id not in already_seen:
2223 video_ids.append(video_id)
2224 already_seen.add(video_id)
2225 if len(video_ids) == n:
2226 # Specified n videos reached
2227 for id in video_ids:
2228 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last page reached — extract whatever was collected.
2231 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2232 for id in video_ids:
2233 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2236 pagenum = pagenum + 1
2239 class GoogleSearchIE(InfoExtractor):
# Handles "gvsearch:", "gvsearchN:" and "gvsearchall:" pseudo-URLs.
# Mirrors YoutubeSearchIE but scrapes Google Video result pages and
# delegates each found docid to the wrapped GoogleIE instance.
2240 """Information Extractor for Google Video search queries."""
2241 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2242 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Captures the docid from videoplay links in a result page.
2243 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2244 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2246 _max_google_results = 1000
2247 IE_NAME = u'video.google:search'
2249 def __init__(self, google_ie, downloader=None):
2250 InfoExtractor.__init__(self, downloader)
# Concrete Google Video extractor that performs per-video extraction.
2251 self._google_ie = google_ie
2253 def report_download_page(self, query, pagenum):
2254 """Report attempt to download playlist page with given number."""
2255 query = query.decode(preferredencoding())
2256 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2258 def _real_initialize(self):
2259 self._google_ie.initialize()
2261 def _real_extract(self, query):
2262 mobj = re.match(self._VALID_URL, query)
2264 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "gvsearchN" prefix from the query text proper.
2267 prefix, query = query.split(':')
2269 query = query.encode('utf-8')
2271 self._download_n_results(query, 1)
2273 elif prefix == 'all':
2274 self._download_n_results(query, self._max_google_results)
2280 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2282 elif n > self._max_google_results:
# Clamp oversized requests to the service maximum and warn.
2283 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2284 n = self._max_google_results
2285 self._download_n_results(query, n)
2287 except ValueError: # parsing prefix as integer fails
2288 self._download_n_results(query, 1)
2291 def _download_n_results(self, query, n):
2292 """Downloads a specified number of results for a query"""
# De-duplicate ids across result pages while preserving first-seen order.
2295 already_seen = set()
2299 self.report_download_page(query, pagenum)
2300 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2301 request = urllib2.Request(result_url)
2303 page = urllib2.urlopen(request).read()
2304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2305 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2308 # Extract video identifiers
2309 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2310 video_id = mobj.group(1)
2311 if video_id not in already_seen:
2312 video_ids.append(video_id)
2313 already_seen.add(video_id)
2314 if len(video_ids) == n:
2315 # Specified n videos reached
2316 for id in video_ids:
2317 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last page reached — extract whatever was collected.
2320 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2321 for id in video_ids:
2322 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2325 pagenum = pagenum + 1
2328 class YahooSearchIE(InfoExtractor):
# Handles "yvsearch:", "yvsearchN:" and "yvsearchall:" pseudo-URLs.
# Mirrors YoutubeSearchIE but scrapes Yahoo! Video result pages and
# delegates each found watch id to the wrapped YahooIE instance.
2329 """Information Extractor for Yahoo! Video search queries."""
2330 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2331 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Captures the "NNN/NNN" watch id from result links.
2332 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2333 _MORE_PAGES_INDICATOR = r'\s*Next'
2335 _max_yahoo_results = 1000
2336 IE_NAME = u'video.yahoo:search'
2338 def __init__(self, yahoo_ie, downloader=None):
2339 InfoExtractor.__init__(self, downloader)
# Concrete Yahoo! Video extractor that performs per-video extraction.
2340 self._yahoo_ie = yahoo_ie
2342 def report_download_page(self, query, pagenum):
2343 """Report attempt to download playlist page with given number."""
2344 query = query.decode(preferredencoding())
2345 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2347 def _real_initialize(self):
2348 self._yahoo_ie.initialize()
2350 def _real_extract(self, query):
2351 mobj = re.match(self._VALID_URL, query)
2353 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "yvsearchN" prefix from the query text proper.
2356 prefix, query = query.split(':')
2358 query = query.encode('utf-8')
2360 self._download_n_results(query, 1)
2362 elif prefix == 'all':
2363 self._download_n_results(query, self._max_yahoo_results)
2369 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2371 elif n > self._max_yahoo_results:
# Clamp oversized requests to the service maximum and warn.
2372 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2373 n = self._max_yahoo_results
2374 self._download_n_results(query, n)
2376 except ValueError: # parsing prefix as integer fails
2377 self._download_n_results(query, 1)
2380 def _download_n_results(self, query, n):
2381 """Downloads a specified number of results for a query"""
# De-duplicate ids across result pages while preserving first-seen order.
2384 already_seen = set()
2388 self.report_download_page(query, pagenum)
2389 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2390 request = urllib2.Request(result_url)
2392 page = urllib2.urlopen(request).read()
2393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2397 # Extract video identifiers
2398 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2399 video_id = mobj.group(1)
2400 if video_id not in already_seen:
2401 video_ids.append(video_id)
2402 already_seen.add(video_id)
2403 if len(video_ids) == n:
2404 # Specified n videos reached
2405 for id in video_ids:
2406 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last page reached — extract whatever was collected.
2409 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2410 for id in video_ids:
2411 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2414 pagenum = pagenum + 1
2417 class YoutubePlaylistIE(InfoExtractor):
# Expands a playlist/artist/user-page URL into individual watch URLs and
# delegates each one to the wrapped YoutubeIE instance.
2418 """Information Extractor for YouTube playlists."""
# Group 1: list-type prefix ('p', 'a' or 'list'); group 2: playlist id;
# group 3 (optional): a single video id embedded in the URL.
2420 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
# Filled as (access_path, prefix_param, playlist_id, page_number).
2421 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2422 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2423 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2425 IE_NAME = u'youtube:playlist'
2427 def __init__(self, youtube_ie, downloader=None):
2428 InfoExtractor.__init__(self, downloader)
# Concrete YouTube extractor that performs the per-video extraction.
2429 self._youtube_ie = youtube_ie
2431 def report_download_page(self, playlist_id, pagenum):
2432 """Report attempt to download playlist page with given number."""
2433 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2435 def _real_initialize(self):
2436 self._youtube_ie.initialize()
2438 def _real_extract(self, url):
2439 # Extract playlist id
2440 mobj = re.match(self._VALID_URL, url)
2442 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL pointed at a single video inside the playlist: extract just it.
2446 if mobj.group(3) is not None:
2447 self._youtube_ie.extract(mobj.group(3))
2450 # Download playlist pages
2451 # prefix is 'p' as default for playlists but there are other types that need extra care
2452 playlist_prefix = mobj.group(1)
2453 if playlist_prefix == 'a':
2454 playlist_access = 'artist'
2456 playlist_prefix = 'p'
2457 playlist_access = 'view_play_list'
2458 playlist_id = mobj.group(2)
2463 self.report_download_page(playlist_id, pagenum)
2464 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471 # Extract video identifiers
# De-duplicate within the page, preserving order of first appearance.
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 if mobj.group(1) not in ids_in_page:
2475 ids_in_page.append(mobj.group(1))
2476 video_ids.extend(ids_in_page)
# No "Next" link means this was the last playlist page.
2478 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2480 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based start).
2482 playliststart = self._downloader.params.get('playliststart', 1) - 1
2483 playlistend = self._downloader.params.get('playlistend', -1)
2484 video_ids = video_ids[playliststart:playlistend]
2486 for id in video_ids:
2487 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2491 class YoutubeUserIE(InfoExtractor):
# Expands a user page / "ytuser:" URL into that user's uploaded videos
# via the GData API, delegating each to the wrapped YoutubeIE instance.
2492 """Information Extractor for YouTube users."""
2494 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2495 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; we page through in this increment.
2496 _GDATA_PAGE_SIZE = 50
2497 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2498 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2500 IE_NAME = u'youtube:user'
2502 def __init__(self, youtube_ie, downloader=None):
2503 InfoExtractor.__init__(self, downloader)
# Concrete YouTube extractor that performs the per-video extraction.
2504 self._youtube_ie = youtube_ie
2506 def report_download_page(self, username, start_index):
2507 """Report attempt to download user page."""
2508 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2509 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2511 def _real_initialize(self):
2512 self._youtube_ie.initialize()
2514 def _real_extract(self, url):
2516 mobj = re.match(self._VALID_URL, url)
2518 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2521 username = mobj.group(1)
2523 # Download video ids using YouTube Data API. Result size per
2524 # query is limited (currently to 50 videos) so we need to query
2525 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2532 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2533 self.report_download_page(username, start_index)
2535 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2538 page = urllib2.urlopen(request).read()
2539 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2540 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2543 # Extract video identifiers
# De-duplicate within the page, preserving order of first appearance.
2546 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2547 if mobj.group(1) not in ids_in_page:
2548 ids_in_page.append(mobj.group(1))
2550 video_ids.extend(ids_in_page)
2552 # A little optimization - if current page is not
2553 # "full", ie. does not contain PAGE_SIZE video ids then
2554 # we can assume that this page is the last one - there
2555 # are no more ids on further pages - no need to query
2558 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2563 all_ids_count = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based start);
# playlistend == -1 means "to the end", so slice open-ended in that case.
2564 playliststart = self._downloader.params.get('playliststart', 1) - 1
2565 playlistend = self._downloader.params.get('playlistend', -1)
2567 if playlistend == -1:
2568 video_ids = video_ids[playliststart:]
2570 video_ids = video_ids[playliststart:playlistend]
2572 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2573 (username, all_ids_count, len(video_ids)))
2575 for video_id in video_ids:
2576 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2579 class DepositFilesIE(InfoExtractor):
# Extractor for arbitrary files (not just video) hosted on
# depositfiles.com; simulates pressing the 'Free download' button.
2580 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment: the optional "../" path segment is a
# two-letter locale prefix in the original URL.
2582 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2583 IE_NAME = u'DepositFiles'
2585 def __init__(self, downloader=None):
2586 InfoExtractor.__init__(self, downloader)
2588 def report_download_webpage(self, file_id):
2589 """Report webpage download."""
2590 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2592 def report_extraction(self, file_id):
2593 """Report information extraction."""
2594 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2596 def _real_initialize(self):
2599 def _real_extract(self, url):
2600 # At this point we have a new file
2601 self._downloader.increment_downloads()
2603 file_id = url.split('/')[-1]
2604 # Rebuild url in english locale
2605 url = 'http://depositfiles.com/en/files/' + file_id
2607 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
2608 free_download_indication = { 'gateway_result' : '1' }
2609 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2611 self.report_download_webpage(file_id)
2612 webpage = urllib2.urlopen(request).read()
2613 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2614 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2617 # Search for the real file URL
2618 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2619 if (mobj is None) or (mobj.group(1) is None):
2620 # Try to figure out reason of the error.
# The site explains download restrictions in an <strong>Attention...
# block; surface that text to the user when present.
2621 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2622 if (mobj is not None) and (mobj.group(1) is not None):
2623 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2624 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2626 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2629 file_url = mobj.group(1)
2630 file_extension = os.path.splitext(file_url)[1][1:]
2632 # Search for file title
2633 mobj = re.search(r'<b title="(.*?)">', webpage)
2635 self._downloader.trouble(u'ERROR: unable to extract title')
2637 file_title = mobj.group(1).decode('utf-8')
2640 # Process file information
2641 self._downloader.process_info({
2642 'id': file_id.decode('utf-8'),
2643 'url': file_url.decode('utf-8'),
2645 'upload_date': u'NA',
2646 'title': file_title,
2647 'stitle': file_title,
2648 'ext': file_extension.decode('utf-8'),
2652 except UnavailableVideoError, err:
2653 self._downloader.trouble(u'ERROR: unable to download file')
2656 class FacebookIE(InfoExtractor):
# Extractor for facebook.com videos. Requires login credentials
# (--username/--password or .netrc under machine 'facebook'); metadata
# and media URLs are scraped out of JavaScript segments in the page.
2657 """Information Extractor for Facebook"""
2659 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2660 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2661 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2662 _available_formats = ['highqual', 'lowqual']
2663 _video_extensions = {
2667 IE_NAME = u'facebook'
2669 def __init__(self, downloader=None):
2670 InfoExtractor.__init__(self, downloader)
2672 def _reporter(self, message):
2673 """Add header and report message."""
2674 self._downloader.to_screen(u'[facebook] %s' % message)
2676 def report_login(self):
2677 """Report attempt to log in."""
2678 self._reporter(u'Logging in')
2680 def report_video_webpage_download(self, video_id):
2681 """Report attempt to download video webpage."""
2682 self._reporter(u'%s: Downloading video webpage' % video_id)
2684 def report_information_extraction(self, video_id):
2685 """Report attempt to extract video information."""
2686 self._reporter(u'%s: Extracting video information' % video_id)
2688 def _parse_page(self, video_webpage):
2689 """Extract video information from page"""
# Map of metadata field name -> regex that captures it from the page.
2691 data = {'title': r'class="video_title datawrap">(.*?)</',
2692 'description': r'<div class="datawrap">(.*?)</div>',
2693 'owner': r'\("video_owner_name", "(.*?)"\)',
2694 'upload_date': r'data-date="(.*?)"',
2695 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Only fields whose regex matched end up in video_info; callers must
# therefore treat every key as optional.
2698 for piece in data.keys():
2699 mobj = re.search(data[piece], video_webpage)
2700 if mobj is not None:
2701 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known quality from "<fmt>_src" JS vars.
2705 for fmt in self._available_formats:
2706 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2707 if mobj is not None:
2708 # URL is in a Javascript segment inside an escaped Unicode format within
2709 # the generally utf-8 page
2710 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2711 video_info['video_urls'] = video_urls
2715 def _real_initialize(self):
2716 if self._downloader is None:
2721 downloader_params = self._downloader.params
2723 # Attempt to use provided username and password or .netrc data
2724 if downloader_params.get('username', None) is not None:
2725 useremail = downloader_params['username']
2726 password = downloader_params['password']
2727 elif downloader_params.get('usenetrc', False):
2729 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2730 if info is not None:
2734 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2735 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue unauthenticated.
2736 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2739 if useremail is None:
2748 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2751 login_results = urllib2.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2752 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2753 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2755 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2756 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2759 def _real_extract(self, url):
2760 mobj = re.match(self._VALID_URL, url)
2762 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2764 video_id = mobj.group('ID')
2767 self.report_video_webpage_download(video_id)
2768 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2770 page = urllib2.urlopen(request)
2771 video_webpage = page.read()
2772 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2773 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2776 # Start extracting information
2777 self.report_information_extraction(video_id)
2779 # Extract information
2780 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; missing keys abort the extraction.
2783 if 'owner' not in video_info:
2784 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2786 video_uploader = video_info['owner']
2789 if 'title' not in video_info:
2790 self._downloader.trouble(u'ERROR: unable to extract video title')
2792 video_title = video_info['title']
2793 video_title = video_title.decode('utf-8')
2794 video_title = sanitize_title(video_title)
2797 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2798 simple_title = simple_title.strip(ur'_')
# Thumbnail is optional: warn and fall back to an empty string.
2801 if 'thumbnail' not in video_info:
2802 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2803 video_thumbnail = ''
2805 video_thumbnail = video_info['thumbnail']
# Upload date: parse an RFC-2822-style date into YYYYMMDD when possible.
2809 if 'upload_date' in video_info:
2810 upload_time = video_info['upload_date']
2811 timetuple = email.utils.parsedate_tz(upload_time)
2812 if timetuple is not None:
2814 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2819 video_description = video_info.get('description', 'No description available.')
2821 url_map = video_info['video_urls']
2822 if len(url_map.keys()) > 0:
2823 # Decide which formats to download
2824 req_format = self._downloader.params.get('format', None)
2825 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit
# (the _available_formats list is ordered best-first).
2827 if format_limit is not None and format_limit in self._available_formats:
2828 format_list = self._available_formats[self._available_formats.index(format_limit):]
2830 format_list = self._available_formats
2831 existing_formats = [x for x in format_list if x in url_map]
2832 if len(existing_formats) == 0:
2833 self._downloader.trouble(u'ERROR: no known formats available for video')
2835 if req_format is None:
2836 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2837 elif req_format == 'worst':
2838 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2839 elif req_format == '-1':
2840 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2843 if req_format not in url_map:
2844 self._downloader.trouble(u'ERROR: requested format not available')
2846 video_url_list = [(req_format, url_map[req_format])] # Specific format
2848 for format_param, video_real_url in video_url_list:
2850 # At this point we have a new video
2851 self._downloader.increment_downloads()
2854 video_extension = self._video_extensions.get(format_param, 'mp4')
2857 # Process video information
2858 self._downloader.process_info({
2859 'id': video_id.decode('utf-8'),
2860 'url': video_real_url.decode('utf-8'),
2861 'uploader': video_uploader.decode('utf-8'),
2862 'upload_date': upload_date,
2863 'title': video_title,
2864 'stitle': simple_title,
2865 'ext': video_extension.decode('utf-8'),
2866 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2867 'thumbnail': video_thumbnail.decode('utf-8'),
2868 'description': video_description.decode('utf-8'),
2871 except UnavailableVideoError, err:
2872 self._downloader.trouble(u'\nERROR: unable to download video')
2874 class BlipTVIE(InfoExtractor):
# Extractor for blip.tv. Instead of scraping HTML it re-requests the
# same URL with skin=json to get structured metadata.
2875 """Information extractor for blip.tv"""
2877 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the filename extension from the media URL.
2878 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2879 IE_NAME = u'blip.tv'
2881 def report_extraction(self, file_id):
2882 """Report information extraction."""
2883 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2885 def _simplify_title(self, title):
# Collapse every run of characters outside the safe set into '_',
# then strip leading/trailing underscores.
2886 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2887 res = res.strip(ur'_')
2890 def _real_extract(self, url):
2891 mobj = re.match(self._VALID_URL, url)
2893 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar is the query separator ('?' or '&') chosen for the original URL.
2900 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2901 request = urllib2.Request(json_url)
2902 self.report_extraction(mobj.group(1))
2904 json_code = urllib2.urlopen(request).read()
2905 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2906 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2909 json_data = json.loads(json_code)
# Some responses wrap the record in a 'Post' envelope; unwrap if so.
2910 if 'Post' in json_data:
2911 data = json_data['Post']
# Convert blip.tv's 'MM-DD-YY HH:MM(am|pm)' datestamp into YYYYMMDD.
2915 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2916 video_url = data['media']['url']
2917 umobj = re.match(self._URL_EXT, video_url)
2919 raise ValueError('Can not determine filename extension')
2920 ext = umobj.group(1)
2922 self._downloader.increment_downloads()
2925 'id': data['item_id'],
2927 'uploader': data['display_name'],
2928 'upload_date': upload_date,
2929 'title': data['title'],
2930 'stitle': self._simplify_title(data['title']),
2932 'format': data['media']['mimeType'],
2933 'thumbnail': data['thumbnailUrl'],
2934 'description': data['description'],
2935 'player_url': data['embedUrl']
# Missing JSON keys or bad dates are reported as a parse failure.
2937 except (ValueError,KeyError), err:
2938 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2942 self._downloader.process_info(info)
2943 except UnavailableVideoError, err:
2944 self._downloader.trouble(u'\nERROR: unable to download video')
2947 class MyVideoIE(InfoExtractor):
2948 """Information Extractor for myvideo.de."""
2950 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2951 IE_NAME = u'myvideo'
2953 def __init__(self, downloader=None):
2954 InfoExtractor.__init__(self, downloader)
2956 def report_download_webpage(self, video_id):
2957 """Report webpage download."""
2958 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2960 def report_extraction(self, video_id):
2961 """Report information extraction."""
2962 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2964 def _real_initialize(self):
2967 def _real_extract(self,url):
2968 mobj = re.match(self._VALID_URL, url)
2970 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2973 video_id = mobj.group(1)
2974 simple_title = mobj.group(2).decode('utf-8')
2975 # should actually not be necessary
2976 simple_title = sanitize_title(simple_title)
2977 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2980 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2982 self.report_download_webpage(video_id)
2983 webpage = urllib2.urlopen(request).read()
2984 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2985 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2988 self.report_extraction(video_id)
2989 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2992 self._downloader.trouble(u'ERROR: unable to extract media URL')
2994 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2996 mobj = re.search('<title>([^<]+)</title>', webpage)
2998 self._downloader.trouble(u'ERROR: unable to extract title')
3001 video_title = mobj.group(1)
3002 video_title = sanitize_title(video_title)
3006 self._downloader.process_info({
3010 'upload_date': u'NA',
3011 'title': video_title,
3012 'stitle': simple_title,
3017 except UnavailableVideoError:
3018 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":shortname" alias (e.g. ":tds", ":colbert") or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		# Status line: metadata extraction has started for this episode.
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		# Status line: fetching the per-media configuration XML.
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		# Status line: fetching the MRSS index listing the episode's parts.
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		# Status line: resolving the flash player URL.
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse every run of characters outside simple_title_chars into a
		# single underscore, then trim underscores from both ends.
		# NOTE(review): no return statement is visible in this excerpt —
		# presumably "return res" follows; confirm against the full file.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')

	def _real_extract(self, url):
		# NOTE(review): several guard/try/return lines are not visible in
		# this excerpt; comments describe only the statements shown.
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# Map the ":shortname" aliases onto the show's front page URL and
		# re-match so the named groups are populated consistently.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty episode group means "front page": download the newest episode.
		dlNewest = not mobj.group('episode')
			epTitle = mobj.group('showname')
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

		# The front page redirects to the newest full episode; re-match the
		# final (post-redirect) URL to recover the concrete episode path.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
			epTitle = mobj.group('episode')

		# The flash <param> element carries both the player URL and the
		# colon-separated media URI.
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

		uri = mMovieParams[0][1]
		# MRSS feed enumerating the individual video parts of the episode.
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# <guid> looks like "...:<show>.com:<id>"; split out both parts.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-part configuration XML listing the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			for rendition in cdoc.findall('.//rendition'):
				# One (bitrate, source URL) pair per rendition.
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()
			effTitle = showId + '-' + epTitle
				'upload_date': officialDate,
				'stitle': self._simplify_title(effTitle),
				'description': officialTitle,
				'player_url': playerUrl

				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
	"""Information extractor for The Escapist """

	# Video pages: escapistmagazine.com/videos/view/<show>/<episode>...
	_VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
	IE_NAME = u'escapist'

	def report_extraction(self, showName):
		# Status line: page extraction has started.
		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

	def report_config_download(self, showName):
		# Status line: downloading the player configuration.
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

	def _simplify_title(self, title):
		# Same normalisation used by the other extractors: collapse
		# non-"simple" characters to underscores and trim the ends.
		# NOTE(review): "return res" is not visible in this excerpt.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')

	def _real_extract(self, url):
		# NOTE(review): several guard/try/return lines are not visible in
		# this excerpt; comments describe only the statements shown.
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		showName = mobj.group('showname')
		videoId = mobj.group('episode')

		self.report_extraction(showName)
			webPage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

		# Scrape description, thumbnail and player URL out of <meta> tags.
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
		description = htmlParser.unescape(descMatch.group(1))
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
		imgUrl = htmlParser.unescape(imgMatch.group(1))
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
		# The player URL embeds the config location in its query string.
		configUrlMatch = re.search('config=(.*)$', playerUrl)
		configUrl = urllib2.unquote(configUrlMatch.group(1))

		self.report_config_download(showName)
			configJSON = urllib2.urlopen(configUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

		# Technically, it's JavaScript, not JSON
		configJSON = configJSON.replace("'", '"')

			config = json.loads(configJSON)
		except (ValueError,), err:
			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

		playlist = config['playlist']
		# Index 1 of the playlist is the actual video; presumably index 0 is
		# a preroll — confirm against the full file / site behaviour.
		videoUrl = playlist[1]['url']

		self._downloader.increment_downloads()
			'uploader': showName,
			'upload_date': None,
			'stitle': self._simplify_title(showName),
			'thumbnail': imgUrl,
			'description': description,
			'player_url': playerUrl,

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a downloader through its
	add_post_processor() method.  After every successful download the
	downloader walks its chain of post processors, calling run() on
	each one: the first call receives the initial information
	dictionary, and every later call receives whatever the previous
	processor returned.

	Processing stops as soon as a processor returns None, or when the
	end of the chain is reached.

	Like InfoExtractor, this class follows a "mutual registration"
	scheme with the downloader.
	"""

	def __init__(self, downloader=None):
		# Reference to the owning downloader; may be None until
		# set_downloader() is invoked during registration.
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is an InfoExtractor-style
		dictionary, extended with a "filepath" key that names the
		downloaded file on disk.

		Returning None halts the post-processing chain; returning a
		(possibly modified) dictionary passes it on to the next
		processor in the chain.  A PostProcessingError may also be
		raised, which the downloader takes into account.
		"""
		# Default behaviour: pass the dictionary through untouched.
		return information
class FFmpegExtractAudioPP(PostProcessor):
	# Post processor that converts a downloaded video into an audio-only
	# file by invoking the external ffmpeg/ffprobe binaries.

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		# 'best' keeps the source codec where possible; users may instead
		# request 'aac' or 'mp3' explicitly (validated by the CLI parser).
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	def get_audio_codec(path):
		# Probe the file with ffprobe and report the name of its audio
		# codec (e.g. 'aac', 'mp3').
		# NOTE(review): this excerpt omits lines here (presumably the
		# @staticmethod decorator and early "return None" paths) — confirm
		# against the full file.
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
		except (IOError, OSError):
		# Scan the key=value stream dump: remember the last codec_name seen
		# and use it once a codec_type=audio stream is encountered.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:

	def run_ffmpeg(path, out_path, codec, more_opts):
		# Transcode path -> out_path with the given audio codec; ffmpeg's
		# exit status signals success.  stderr is folded into the discarded
		# stdout stream.
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
		except (IOError, OSError):

	def run(self, information):
		# Entry point invoked by the downloader after a successful download.
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs the ADTS container to be playable.
					more_opts = ['-f', 'adts']
				acodec = 'libmp3lame'
				more_opts = ['-ab', '128k']
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		# Write the audio next to the video, swapping only the extension.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

			self._downloader.to_stderr(u'WARNING: error running ffmpeg')

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
				os.utime(new_path, (time.time(), information['filetime']))
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

		# Point the chain at the new audio file for any later processors.
		information['filepath'] = new_path
3382 def updateSelf(downloader, filename):
3383 ''' Update the program file with the latest version from the repository '''
3384 # Note: downloader only used for options
3385 if not os.access(filename, os.W_OK):
3386 sys.exit('ERROR: no write permissions on %s' % filename)
3388 downloader.to_screen('Updating to latest version...')
3392 urlh = urllib.urlopen(UPDATE_URL)
3393 newcontent = urlh.read()
3396 except (IOError, OSError), err:
3397 sys.exit('ERROR: unable to download latest version')
3400 outf = open(filename, 'wb')
3402 outf.write(newcontent)
3405 except (IOError, OSError), err:
3406 sys.exit('ERROR: unable to overwrite current version')
3408 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3415 def _format_option_string(option):
3416 ''' ('-o', '--option') -> -o, --format METAVAR'''
3420 if option._short_opts: opts.append(option._short_opts[0])
3421 if option._long_opts: opts.append(option._long_opts[0])
3422 if len(opts) > 1: opts.insert(1, ', ')
3424 if option.takes_value(): opts.append(' %s' % option.metavar)
3426 return "".join(opts)
3428 def _find_term_columns():
3429 columns = os.environ.get('COLUMNS', None)
3434 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3435 out,err = sp.communicate()
3436 return int(out.split()[1])
# Body of parseOpts(): builds the optparse parser, declares every CLI
# option grouped by help section, and parses sys.argv.
# NOTE(review): some lines (e.g. the "max_width = ..." default and the
# braces of the kw dict) are not visible in this excerpt.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Use the custom "-o, --option METAVAR" formatter defined above.
fmt.format_option_strings = _format_option_string

	'version' : __version__,
	'usage' : '%prog [options] url [url...]',
	'conflict_handler' : 'resolve',

parser = optparse.OptionParser(**kw)

# One OptionGroup per help section.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
		action='help', help='print this help text and exit')
general.add_option('-v', '--version',
		action='version', help='print program version and exit')
general.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
		action='store_true', dest='dump_user_agent',
		help='display the current browser identification', default=False)
general.add_option('--list-extractors',
		action='store_true', dest='list_extractors',
		help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')

verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
		action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail',
		help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription',
		help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
		action='store_true', dest='getfilename',
		help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
		action='store_true', dest='consoletitle',
		help='display progress in console titlebar', default=False)

filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber',
		help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
filesystem.add_option('--no-part',
		action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
		action='store_false', dest='updatetime',
		help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
		action='store_true', dest='writedescription',
		help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
		action='store_true', dest='writeinfojson',
		help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
		help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
		help='"best", "aac" or "mp3"; best by default')

# Register every group with the parser; this order is the --help order.
parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

opts, args = parser.parse_args()

return parser, opts, args
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# These three extractors are created first because several others
	# (playlist, user, search variants) are constructed on top of them.
	# NOTE(review): this excerpt omits the list brackets and several
	# entries of the returned list — confirm against the full file.
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()
		MetacafeIE(youtube_ie),
		YoutubePlaylistIE(youtube_ie),
		YoutubeUserIE(youtube_ie),
		YoutubeSearchIE(youtube_ie),
		GoogleSearchIE(google_ie),
		YahooSearchIE(yahoo_ie),
# Main program logic: parse the command line, configure the FileDownloader
# and run the requested downloads.
# NOTE(review): the enclosing "def" line and several guard/else/try lines
# are not visible in this excerpt; comments describe the visible statements.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
		# --cookies given: use a MozillaCookieJar persisted to that file,
		# pre-loading it when the file already exists and is readable.
		jar = cookielib.MozillaCookieJar(opts.cookiefile)
		if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to open cookie file')

# --dump-user-agent: print the UA string shared by all HTTP requests.
if opts.dump_user_agent:
	print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
		if opts.batchfile == '-':
		batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and lines starting with '#', '/' or ';'.
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

extractors = gen_extractors()

# --list-extractors: report, for each extractor, which of the given URLs
# it would handle (each URL is claimed by at most one extractor).
if opts.list_extractors:
	for ie in extractors:
		matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
		all_urls = filter(lambda url: url not in matchedUrls, all_urls)
		for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	# Prompt interactively rather than requiring -p on the command line.
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist start number specified')
	opts.playlistend = int(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3']:
		parser.error(u'invalid audio format specified')

# Central downloader object holding every effective option.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any "print X and exit" flag implies quiet operation.
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'simulate': opts.simulate,
	'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'format': opts.format,
	'format_limit': opts.format_limit,
	# Output template: explicit -o wins; otherwise the first matching
	# combination of title/literal/autonumber/format flags decides.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	# Writing the video to stdout ('-o -') means logs must go to stderr.
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	'matchtitle': opts.matchtitle,
	'rejecttitle': opts.rejecttitle,
for extractor in extractors:
	fd.add_info_extractor(extractor)

# Optional post-processing: audio extraction via ffmpeg.
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# --update: replace this script with the latest released version.
if opts.update_self:
	updateSelf(fd, sys.argv[0])

if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to save cookie jar')
if __name__ == '__main__':
	# Script entry point: translate the internal exceptions into exit
	# messages.  NOTE(review): the "try:"/main-call lines and the
	# DownloadError handler body are not visible in this excerpt.
	except DownloadError:
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: