2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator and called
	# .next() on a fresh generator every call, which bought nothing;
	# compute the value directly instead.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists and can encode text
		u'TEST'.encode(pref)
	except:
		# Locale reported an unusable/unknown encoding; fall back to UTF-8
		pref = 'UTF-8'
	return pref
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities (named and numeric) into their literal characters
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# The OS path separator would split the name into directories; mask it
	return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as an integer, or None when the string
	cannot be parsed as an RFC 2822 date.
	"""
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	# Both counts are in bytes
	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly: legacy consoles may not accept raw unicode
		print >>sys.stderr, message.encode(preferredencoding())
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template with no %(field)s placeholders produces the same
		filename for every video, so downloading several URLs with it
		would collide on disk.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		# Informational only; encoding errors must not abort the download
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
	def report_writeinfojson(self, infofn):
		""" Report that the video metadata is being written as JSON """
		# Informational only; encoding errors must not abort the download
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
	def report_destination(self, filename):
		"""Report destination filename."""
		# Informational only; encoding errors must not abort the download
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
647 """Report download progress."""
648 if self.params.get('noprogress', False):
650 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
651 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
652 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
653 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume a partial download at the given byte offset."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx.

		count is the current attempt number, retries the configured maximum.
		"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663 def report_file_already_downloaded(self, file_name):
664 """Report file has already been fully downloaded."""
666 self.to_screen(u'[download] %s has already been downloaded' % file_name)
667 except (UnicodeEncodeError), err:
668 self.to_screen(u'[download] The file has already been downloaded')
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download; it restarts from scratch."""
		self.to_screen(u'[download] Unable to resume')
674 def report_finish(self):
675 """Report download finished."""
676 if self.params.get('noprogress', False):
677 self.to_screen(u'[download] Download completed')
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file.

		The counter feeds the %(autonumber)s field of the output template.
		"""
		self._num_downloads += 1
685 def prepare_filename(self, info_dict):
686 """Generate the output filename."""
688 template_dict = dict(info_dict)
689 template_dict['epoch'] = unicode(long(time.time()))
690 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691 filename = self.params['outtmpl'] % template_dict
693 except (ValueError, KeyError), err:
694 self.trouble(u'ERROR: invalid system charset or erroneous output template')
697 def process_info(self, info_dict):
698 """Process a single dictionary returned by an InfoExtractor."""
699 filename = self.prepare_filename(info_dict)
702 if self.params.get('forcetitle', False):
703 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
704 if self.params.get('forceurl', False):
705 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
706 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
707 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
708 if self.params.get('forcedescription', False) and 'description' in info_dict:
709 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
710 if self.params.get('forcefilename', False) and filename is not None:
711 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 # Do nothing else if in simulate mode
714 if self.params.get('simulate', False):
720 matchtitle=self.params.get('matchtitle',False)
721 rejecttitle=self.params.get('rejecttitle',False)
722 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
723 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
724 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
726 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
730 if self.params.get('nooverwrites', False) and os.path.exists(filename):
731 self.to_stderr(u'WARNING: file exists and will be skipped')
735 dn = os.path.dirname(filename)
736 if dn != '' and not os.path.exists(dn):
738 except (OSError, IOError), err:
739 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
742 if self.params.get('writedescription', False):
744 descfn = filename + '.description'
745 self.report_writedescription(descfn)
746 descfile = open(descfn, 'wb')
748 descfile.write(info_dict['description'].encode('utf-8'))
751 except (OSError, IOError):
752 self.trouble(u'ERROR: Cannot write description file ' + descfn)
755 if self.params.get('writeinfojson', False):
756 infofn = filename + '.info.json'
757 self.report_writeinfojson(infofn)
760 except (NameError,AttributeError):
761 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
764 infof = open(infofn, 'wb')
766 json.dump(info_dict, infof)
769 except (OSError, IOError):
770 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
773 if not self.params.get('skip_download', False):
775 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
776 except (OSError, IOError), err:
777 raise UnavailableVideoError
778 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
779 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
781 except (ContentTooShortError, ), err:
782 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
787 self.post_process(filename, info_dict)
788 except (PostProcessingError), err:
789 self.trouble(u'ERROR: postprocessing: %s' % str(err))
792 def download(self, url_list):
793 """Download a given list of URLs."""
794 if len(url_list) > 1 and self.fixed_template():
795 raise SameFileError(self.params['outtmpl'])
798 suitable_found = False
800 # Go to next InfoExtractor if not suitable
801 if not ie.suitable(url):
804 # Suitable InfoExtractor found
805 suitable_found = True
807 # Extract information from URL and process it
810 # Suitable InfoExtractor had been found; go to next URL
813 if not suitable_found:
814 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
816 return self._download_retcode
818 def post_process(self, filename, ie_info):
819 """Run the postprocessing chain on the given file."""
821 info['filepath'] = filename
827 def _download_with_rtmpdump(self, filename, url, player_url):
828 self.report_destination(filename)
829 tmpfilename = self.temp_name(filename)
831 # Check for rtmpdump first
833 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
834 except (OSError, IOError):
835 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
838 # Download using rtmpdump. rtmpdump returns exit code 2 when
839 # the connection was interrumpted and resuming appears to be
840 # possible. This is part of rtmpdump's normal usage, AFAIK.
841 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
842 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
843 while retval == 2 or retval == 1:
844 prevsize = os.path.getsize(tmpfilename)
845 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
846 time.sleep(5.0) # This seems to be needed
847 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
848 cursize = os.path.getsize(tmpfilename)
849 if prevsize == cursize and retval == 1:
851 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
852 if prevsize == cursize and retval == 2 and cursize > 1024:
853 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
857 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
858 self.try_rename(tmpfilename, filename)
861 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
864 def _do_download(self, filename, url, player_url):
865 # Check file already present
866 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
867 self.report_file_already_downloaded(filename)
870 # Attempt to download using rtmpdump
871 if url.startswith('rtmp'):
872 return self._download_with_rtmpdump(filename, url, player_url)
874 tmpfilename = self.temp_name(filename)
878 # Do not include the Accept-Encoding header
879 headers = {'Youtubedl-no-compression': 'True'}
880 basic_request = urllib2.Request(url, None, headers)
881 request = urllib2.Request(url, None, headers)
883 # Establish possible resume length
884 if os.path.isfile(tmpfilename):
885 resume_len = os.path.getsize(tmpfilename)
889 # Request parameters in case of being able to resume
890 if self.params.get('continuedl', False) and resume_len != 0:
891 self.report_resuming_byte(resume_len)
892 request.add_header('Range', 'bytes=%d-' % resume_len)
896 retries = self.params.get('retries', 0)
897 while count <= retries:
898 # Establish connection
900 data = urllib2.urlopen(request)
902 except (urllib2.HTTPError, ), err:
903 if (err.code < 500 or err.code >= 600) and err.code != 416:
904 # Unexpected HTTP error
906 elif err.code == 416:
907 # Unable to resume (requested range not satisfiable)
909 # Open the connection again without the range header
910 data = urllib2.urlopen(basic_request)
911 content_length = data.info()['Content-Length']
912 except (urllib2.HTTPError, ), err:
913 if err.code < 500 or err.code >= 600:
916 # Examine the reported length
917 if (content_length is not None and
918 (resume_len - 100 < long(content_length) < resume_len + 100)):
919 # The file had already been fully downloaded.
920 # Explanation to the above condition: in issue #175 it was revealed that
921 # YouTube sometimes adds or removes a few bytes from the end of the file,
922 # changing the file size slightly and causing problems for some users. So
923 # I decided to implement a suggested change and consider the file
924 # completely downloaded if the file size differs less than 100 bytes from
925 # the one in the hard drive.
926 self.report_file_already_downloaded(filename)
927 self.try_rename(tmpfilename, filename)
930 # The length does not match, we start the download over
931 self.report_unable_to_resume()
937 self.report_retry(count, retries)
940 self.trouble(u'ERROR: giving up after %s retries' % retries)
943 data_len = data.info().get('Content-length', None)
944 if data_len is not None:
945 data_len = long(data_len) + resume_len
946 data_len_str = self.format_bytes(data_len)
947 byte_counter = 0 + resume_len
953 data_block = data.read(block_size)
955 if len(data_block) == 0:
957 byte_counter += len(data_block)
959 # Open file just in time
962 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
963 assert stream is not None
964 filename = self.undo_temp_name(tmpfilename)
965 self.report_destination(filename)
966 except (OSError, IOError), err:
967 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
970 stream.write(data_block)
971 except (IOError, OSError), err:
972 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
974 block_size = self.best_block_size(after - before, len(data_block))
977 percent_str = self.calc_percent(byte_counter, data_len)
978 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
979 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
980 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
983 self.slow_down(start, byte_counter - resume_len)
986 self.trouble(u'\nERROR: Did not get any data blocks')
990 if data_len is not None and byte_counter != data_len:
991 raise ContentTooShortError(byte_counter, long(data_len))
992 self.try_rename(tmpfilename, filename)
994 # Update file modification time
995 if self.params.get('updatetime', True):
996 self.try_utime(filename, data.info().get('last-modified', None))
1001 class InfoExtractor(object):
1002 """Information Extractor class.
1004 Information extractors are the classes that, given a URL, extract
1005 information from the video (or videos) the URL refers to. This
1006 information includes the real video URL, the video title and simplified
1007 title, author and others. The information is stored in a dictionary
1008 which is then passed to the FileDownloader. The FileDownloader
1009 processes this information possibly downloading the video to the file
1010 system, among other possible outcomes. The dictionaries must include
1011 the following fields:
1013 id: Video identifier.
1014 url: Final video URL.
1015 uploader: Nickname of the video uploader.
1016 title: Literal title.
1017 stitle: Simplified title.
1018 ext: Video filename extension.
1019 format: Video format.
1020 player_url: SWF Player URL (may be None).
1022 The following fields are optional. Their primary purpose is to allow
1023 youtube-dl to serve as the backend for a video search function, such
1024 as the one in youtube2mp3. They are only used when their respective
1025 forced printing functions are called:
1027 thumbnail: Full URL to a video thumbnail image.
1028 description: One-line video description.
1030 Subclasses of this one should re-define the _real_initialize() and
1031 _real_extract() methods and define a _VALID_URL regexp.
1032 Probably, they should also be added to the list of extractors.
1038 def __init__(self, downloader=None):
1039 """Constructor. Receives an optional downloader."""
1041 self.set_downloader(downloader)
	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE.

		Matches the URL against the subclass-defined _VALID_URL regexp.
		"""
		return re.match(self._VALID_URL, url) is not None
1047 def initialize(self):
1048 """Initializes an instance (authentication, etc)."""
1050 self._real_initialize()
1053 def extract(self, url):
1054 """Extracts URL information and returns it in list of dicts."""
1056 return self._real_extract(url)
	def set_downloader(self, downloader):
		"""Sets the downloader for this IE.

		downloader may be None when the IE is used standalone.
		"""
		self._downloader = downloader
1062 def _real_initialize(self):
1063 """Real initialization process. Redefine in subclasses."""
1066 def _real_extract(self, url):
1067 """Real extraction process. Redefine in subclasses."""
1071 class YoutubeIE(InfoExtractor):
1072 """Information extractor for youtube.com."""
1074 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1075 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1076 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1077 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1078 _NETRC_MACHINE = 'youtube'
1079 # Listed in order of quality
1080 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1081 _video_extensions = {
1087 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1091 IE_NAME = u'youtube'
	def report_lang(self):
		"""Report attempt to set language (forcing English pages for stable parsing)."""
		self._downloader.to_screen(u'[youtube] Setting language')
	def report_login(self):
		"""Report attempt to log in to YouTube."""
		self._downloader.to_screen(u'[youtube] Logging in')
	def report_age_confirmation(self):
		"""Report attempt to confirm age (required for age-restricted videos)."""
		self._downloader.to_screen(u'[youtube] Confirming age')
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download the watch page for video_id."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download the get_video_info page for video_id."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information for video_id."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1117 def report_unavailable_format(self, video_id, format):
1118 """Report extracted video URL."""
1119 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1121 def report_rtmp_download(self):
1122 """Indicate the download will use the RTMP protocol."""
1123 self._downloader.to_screen(u'[youtube] RTMP download detected')
1125 def _real_initialize(self):
1126 if self._downloader is None:
1131 downloader_params = self._downloader.params
1133 # Attempt to use provided username and password or .netrc data
1134 if downloader_params.get('username', None) is not None:
1135 username = downloader_params['username']
1136 password = downloader_params['password']
1137 elif downloader_params.get('usenetrc', False):
1139 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1140 if info is not None:
1144 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1145 except (IOError, netrc.NetrcParseError), err:
1146 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1150 request = urllib2.Request(self._LANG_URL)
1153 urllib2.urlopen(request).read()
1154 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1155 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1158 # No authentication to be performed
1159 if username is None:
1164 'current_form': 'loginForm',
1166 'action_login': 'Log In',
1167 'username': username,
1168 'password': password,
1170 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1173 login_results = urllib2.urlopen(request).read()
1174 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1175 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1177 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1178 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1184 'action_confirm': 'Confirm',
1186 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1188 self.report_age_confirmation()
1189 age_results = urllib2.urlopen(request).read()
1190 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1191 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1194 def _real_extract(self, url):
1195 # Extract video id from URL
1196 mobj = re.match(self._VALID_URL, url)
1198 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1200 video_id = mobj.group(2)
1203 self.report_video_webpage_download(video_id)
1204 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1206 video_webpage = urllib2.urlopen(request).read()
1207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1208 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1211 # Attempt to extract SWF player URL
1212 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1213 if mobj is not None:
1214 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1219 self.report_video_info_webpage_download(video_id)
1220 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1221 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1222 % (video_id, el_type))
1223 request = urllib2.Request(video_info_url)
1225 video_info_webpage = urllib2.urlopen(request).read()
1226 video_info = parse_qs(video_info_webpage)
1227 if 'token' in video_info:
1229 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1230 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1232 if 'token' not in video_info:
1233 if 'reason' in video_info:
1234 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1236 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1239 # Start extracting information
1240 self.report_information_extraction(video_id)
1243 if 'author' not in video_info:
1244 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1246 video_uploader = urllib.unquote_plus(video_info['author'][0])
1249 if 'title' not in video_info:
1250 self._downloader.trouble(u'ERROR: unable to extract video title')
1252 video_title = urllib.unquote_plus(video_info['title'][0])
1253 video_title = video_title.decode('utf-8')
1254 video_title = sanitize_title(video_title)
1257 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1258 simple_title = simple_title.strip(ur'_')
1261 if 'thumbnail_url' not in video_info:
1262 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1263 video_thumbnail = ''
1264 else: # don't panic if we can't find it
1265 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1269 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1270 if mobj is not None:
1271 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1272 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1273 for expression in format_expressions:
1275 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1283 video_description = u'No description available.'
1284 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1285 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1286 if mobj is not None:
1287 video_description = mobj.group(1).decode('utf-8')
1289 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1290 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1291 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1292 # TODO use another parser
1295 video_token = urllib.unquote_plus(video_info['token'][0])
1297 # Decide which formats to download
1298 req_format = self._downloader.params.get('format', None)
1300 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1301 self.report_rtmp_download()
1302 video_url_list = [(None, video_info['conn'][0])]
1303 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1304 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1305 url_data = [parse_qs(uds) for uds in url_data_strs]
1306 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1307 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1309 format_limit = self._downloader.params.get('format_limit', None)
1310 if format_limit is not None and format_limit in self._available_formats:
1311 format_list = self._available_formats[self._available_formats.index(format_limit):]
1313 format_list = self._available_formats
1314 existing_formats = [x for x in format_list if x in url_map]
1315 if len(existing_formats) == 0:
1316 self._downloader.trouble(u'ERROR: no known formats available for video')
1318 if req_format is None:
1319 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1320 elif req_format == '-1':
1321 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1324 if req_format not in url_map:
1325 self._downloader.trouble(u'ERROR: requested format not available')
1327 video_url_list = [(req_format, url_map[req_format])] # Specific format
1329 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1332 for format_param, video_real_url in video_url_list:
1333 # At this point we have a new video
1334 self._downloader.increment_downloads()
1337 video_extension = self._video_extensions.get(format_param, 'flv')
1340 # Process video information
1341 self._downloader.process_info({
1342 'id': video_id.decode('utf-8'),
1343 'url': video_real_url.decode('utf-8'),
1344 'uploader': video_uploader.decode('utf-8'),
1345 'upload_date': upload_date,
1346 'title': video_title,
1347 'stitle': simple_title,
1348 'ext': video_extension.decode('utf-8'),
1349 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1350 'thumbnail': video_thumbnail.decode('utf-8'),
1351 'description': video_description,
1352 'player_url': player_url,
1354 except UnavailableVideoError, err:
1355 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing with elided lines (gaps in the numbering); guards
# such as `if mobj is None:` and `return` statements are partially missing.
1358 class MetacafeIE(InfoExtractor):
1359 """Information Extractor for metacafe.com."""
1361 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1362 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1363 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1365 IE_NAME = u'metacafe'
# Keeps a YoutubeIE instance so 'yt-<id>' videos can be delegated to it.
1367 def __init__(self, youtube_ie, downloader=None):
1368 InfoExtractor.__init__(self, downloader)
1369 self._youtube_ie = youtube_ie
# --- Progress-reporting helpers ---
1371 def report_disclaimer(self):
1372 """Report disclaimer retrieval."""
1373 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1375 def report_age_confirmation(self):
1376 """Report attempt to confirm age."""
1377 self._downloader.to_screen(u'[metacafe] Confirming age')
1379 def report_download_webpage(self, video_id):
1380 """Report webpage download."""
1381 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1383 def report_extraction(self, video_id):
1384 """Report information extraction."""
1385 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Initialization: fetch the disclaimer page, then POST the family-filter
# form to disable filtering for this session.
1387 def _real_initialize(self):
1388 # Retrieve disclaimer
1389 request = urllib2.Request(self._DISCLAIMER)
1391 self.report_disclaimer()
1392 disclaimer = urllib2.urlopen(request).read()
1393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1400 'submit': "Continue - I'm over 18",
1402 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1404 self.report_age_confirmation()
1405 disclaimer = urllib2.urlopen(request).read()
1406 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1407 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1410 def _real_extract(self, url):
1411 # Extract id and simplified title from URL
1412 mobj = re.match(self._VALID_URL, url)
1414 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1417 video_id = mobj.group(1)
1419 # Check if video comes from YouTube
1420 mobj2 = re.match(r'^yt-(.*)$', video_id)
1421 if mobj2 is not None:
# Delegate 'yt-<id>' entries to the YouTube extractor (elided line
# presumably returns here — TODO confirm).
1422 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1425 # At this point we have a new video
1426 self._downloader.increment_downloads()
# Second URL path segment doubles as the simplified title.
1428 simple_title = mobj.group(2).decode('utf-8')
1430 # Retrieve video webpage to extract further information
1431 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1433 self.report_download_webpage(video_id)
1434 webpage = urllib2.urlopen(request).read()
1435 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1436 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1439 # Extract URL, uploader and title from webpage
1440 self.report_extraction(video_id)
# Primary path: &mediaURL= present in the page, optionally signed with
# a gdaKey appended as __gda__.
1441 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1442 if mobj is not None:
1443 mediaURL = urllib.unquote(mobj.group(1))
# Extension taken from the last three characters of the media URL.
1444 video_extension = mediaURL[-3:]
1446 # Extract gdaKey if available
1447 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1449 video_url = mediaURL
1451 gdaKey = mobj.group(1)
1452 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars value, then the mediaData JSON-ish
# blob for mediaURL + key.
1454 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1456 self._downloader.trouble(u'ERROR: unable to extract media URL')
1458 vardict = parse_qs(mobj.group(1))
1459 if 'mediaData' not in vardict:
1460 self._downloader.trouble(u'ERROR: unable to extract media URL')
1462 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1464 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Undo JSON escaping of slashes before use.
1466 mediaURL = mobj.group(1).replace('\\/', '/')
1467 video_extension = mediaURL[-3:]
1468 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1470 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1472 self._downloader.trouble(u'ERROR: unable to extract title')
1474 video_title = mobj.group(1).decode('utf-8')
1475 video_title = sanitize_title(video_title)
1477 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1479 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1481 video_uploader = mobj.group(1)
1484 # Process video information
1485 self._downloader.process_info({
1486 'id': video_id.decode('utf-8'),
1487 'url': video_url.decode('utf-8'),
1488 'uploader': video_uploader.decode('utf-8'),
1489 'upload_date': u'NA',
1490 'title': video_title,
1491 'stitle': simple_title,
1492 'ext': video_extension.decode('utf-8'),
1496 except UnavailableVideoError:
1497 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing with elided lines (gaps in the numbering);
# `if mobj is None:` guards and `return` statements are partially missing.
1500 class DailymotionIE(InfoExtractor):
1501 """Information Extractor for Dailymotion"""
1503 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1504 IE_NAME = u'dailymotion'
1506 def __init__(self, downloader=None):
1507 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
1509 def report_download_webpage(self, video_id):
1510 """Report webpage download."""
1511 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1513 def report_extraction(self, video_id):
1514 """Report information extraction."""
1515 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No initialization needed (body elided in this listing).
1517 def _real_initialize(self):
1520 def _real_extract(self, url):
1521 # Extract id and simplified title from URL
1522 mobj = re.match(self._VALID_URL, url)
1524 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1527 # At this point we have a new video
1528 self._downloader.increment_downloads()
1529 video_id = mobj.group(1)
# The URL slug after '_' doubles as the simplified title.
1531 simple_title = mobj.group(2).decode('utf-8')
1532 video_extension = 'flv'
1534 # Retrieve video webpage to extract further information
1535 request = urllib2.Request(url)
# Disable the family filter via cookie so restricted videos load.
1536 request.add_header('Cookie', 'family_filter=off')
1538 self.report_download_webpage(video_id)
1539 webpage = urllib2.urlopen(request).read()
1540 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1541 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1544 # Extract URL, uploader and title from webpage
1545 self.report_extraction(video_id)
# The player's "sequence" flashvar is a urlencoded blob containing the
# stream URLs; the SD URL ("sdURL") is the one downloaded.
1546 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1548 self._downloader.trouble(u'ERROR: unable to extract media URL')
1550 sequence = urllib.unquote(mobj.group(1))
1551 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1553 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Strip the JSON backslash escaping from the URL.
1555 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1557 # if needed add http://www.dailymotion.com/ if relative URL
1559 video_url = mediaURL
1561 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1563 self._downloader.trouble(u'ERROR: unable to extract title')
1565 video_title = mobj.group(1).decode('utf-8')
1566 video_title = sanitize_title(video_title)
1568 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1570 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1572 video_uploader = mobj.group(1)
1575 # Process video information
1576 self._downloader.process_info({
1577 'id': video_id.decode('utf-8'),
1578 'url': video_url.decode('utf-8'),
1579 'uploader': video_uploader.decode('utf-8'),
1580 'upload_date': u'NA',
1581 'title': video_title,
1582 'stitle': simple_title,
1583 'ext': video_extension.decode('utf-8'),
1587 except UnavailableVideoError:
1588 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing with elided lines (gaps in the numbering);
# guards and parts of the process_info dict are missing from view.
1591 class GoogleIE(InfoExtractor):
1592 """Information extractor for video.google.com."""
1594 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1595 IE_NAME = u'video.google'
1597 def __init__(self, downloader=None):
1598 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
1600 def report_download_webpage(self, video_id):
1601 """Report webpage download."""
1602 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1604 def report_extraction(self, video_id):
1605 """Report information extraction."""
1606 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No initialization needed (body elided in this listing).
1608 def _real_initialize(self):
1611 def _real_extract(self, url):
1612 # Extract id from URL
1613 mobj = re.match(self._VALID_URL, url)
1615 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1618 # At this point we have a new video
1619 self._downloader.increment_downloads()
1620 video_id = mobj.group(1)
1622 video_extension = 'mp4'
1624 # Retrieve video webpage to extract further information
1625 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1627 self.report_download_webpage(video_id)
1628 webpage = urllib2.urlopen(request).read()
1629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1633 # Extract URL, uploader, and title from webpage
1634 self.report_extraction(video_id)
# Primary path: download_url flashvar (mp4).  Fallback: the hex-escaped
# videoUrl variable (flv).
1635 mobj = re.search(r"download_url:'([^']+)'", webpage)
1637 video_extension = 'flv'
1638 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1640 self._downloader.trouble(u'ERROR: unable to extract media URL')
1642 mediaURL = urllib.unquote(mobj.group(1))
# Undo the \x3d ('=') and \x26 ('&') escaping used in the page source.
1643 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1644 mediaURL = mediaURL.replace('\\x26', '\x26')
1646 video_url = mediaURL
1648 mobj = re.search(r'<title>(.*)</title>', webpage)
1650 self._downloader.trouble(u'ERROR: unable to extract title')
1652 video_title = mobj.group(1).decode('utf-8')
1653 video_title = sanitize_title(video_title)
1654 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1656 # Extract video description
1657 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1659 self._downloader.trouble(u'ERROR: unable to extract video description')
1661 video_description = mobj.group(1).decode('utf-8')
1662 if not video_description:
1663 video_description = 'No description available.'
# Thumbnail requires a second request (search page), so it is fetched
# only when the user explicitly asked for it.
1665 # Extract video thumbnail
1666 if self._downloader.params.get('forcethumbnail', False):
1667 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1669 webpage = urllib2.urlopen(request).read()
1670 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1671 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1673 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1675 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1677 video_thumbnail = mobj.group(1)
1678 else: # we need something to pass to process_info
1679 video_thumbnail = ''
1682 # Process video information
1683 self._downloader.process_info({
1684 'id': video_id.decode('utf-8'),
1685 'url': video_url.decode('utf-8'),
# NOTE(review): no 'uploader' entry is visible here (line 1686 elided) —
# presumably it set the uploader; verify against the full source.
1687 'upload_date': u'NA',
1688 'title': video_title,
1689 'stitle': simple_title,
1690 'ext': video_extension.decode('utf-8'),
1694 except UnavailableVideoError:
1695 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing with elided lines (gaps in the numbering);
# guards such as `if mobj is None:` are partially missing from view.
1698 class PhotobucketIE(InfoExtractor):
1699 """Information extractor for photobucket.com."""
1701 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1702 IE_NAME = u'photobucket'
1704 def __init__(self, downloader=None):
1705 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
1707 def report_download_webpage(self, video_id):
1708 """Report webpage download."""
1709 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1711 def report_extraction(self, video_id):
1712 """Report information extraction."""
1713 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed (body elided in this listing).
1715 def _real_initialize(self):
1718 def _real_extract(self, url):
1719 # Extract id from URL
1720 mobj = re.match(self._VALID_URL, url)
1722 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1725 # At this point we have a new video
1726 self._downloader.increment_downloads()
# The 'current=' query value (an .flv name) serves as the video id.
1727 video_id = mobj.group(1)
1729 video_extension = 'flv'
1731 # Retrieve video webpage to extract further information
1732 request = urllib2.Request(url)
1734 self.report_download_webpage(video_id)
1735 webpage = urllib2.urlopen(request).read()
1736 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1737 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1740 # Extract URL, uploader, and title from webpage
1741 self.report_extraction(video_id)
# Media URL comes from the video_src <link> tag's file= parameter.
1742 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1744 self._downloader.trouble(u'ERROR: unable to extract media URL')
1746 mediaURL = urllib.unquote(mobj.group(1))
1748 video_url = mediaURL
# Page <title> carries both the video title (group 1) and uploader (group 2).
1750 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1752 self._downloader.trouble(u'ERROR: unable to extract title')
1754 video_title = mobj.group(1).decode('utf-8')
1755 video_title = sanitize_title(video_title)
1756 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1758 video_uploader = mobj.group(2).decode('utf-8')
1761 # Process video information
1762 self._downloader.process_info({
1763 'id': video_id.decode('utf-8'),
1764 'url': video_url.decode('utf-8'),
1765 'uploader': video_uploader,
1766 'upload_date': u'NA',
1767 'title': video_title,
1768 'stitle': simple_title,
1769 'ext': video_extension.decode('utf-8'),
1773 except UnavailableVideoError:
1774 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing with elided lines (gaps in the numbering);
# guards such as `if mobj is None:` are partially missing from view.
1777 class YahooIE(InfoExtractor):
1778 """Information extractor for video.yahoo.com."""
1780 # _VALID_URL matches all Yahoo! Video URLs
1781 # _VPAGE_URL matches only the extractable '/watch/' URLs
1782 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1783 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1784 IE_NAME = u'video.yahoo'
1786 def __init__(self, downloader=None):
1787 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
1789 def report_download_webpage(self, video_id):
1790 """Report webpage download."""
1791 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1793 def report_extraction(self, video_id):
1794 """Report information extraction."""
1795 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed (body elided in this listing).
1797 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting,
# so the download counter is not incremented twice.
1800 def _real_extract(self, url, new_video=True):
1801 # Extract ID from URL
1802 mobj = re.match(self._VALID_URL, url)
1804 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1807 # At this point we have a new video
1808 self._downloader.increment_downloads()
1809 video_id = mobj.group(2)
1810 video_extension = 'flv'
1812 # Rewrite valid but non-extractable URLs as
1813 # extractable English language /watch/ URLs
1814 if re.match(self._VPAGE_URL, url) is None:
1815 request = urllib2.Request(url)
1817 webpage = urllib2.urlopen(request).read()
1818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1819 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1822 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1824 self._downloader.trouble(u'ERROR: Unable to extract id field')
1826 yahoo_id = mobj.group(1)
1828 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1830 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1832 yahoo_vid = mobj.group(1)
# Recurse once on the canonical /watch/ URL.
1834 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1835 return self._real_extract(url, new_video=False)
1837 # Retrieve video webpage to extract further information
1838 request = urllib2.Request(url)
1840 self.report_download_webpage(video_id)
1841 webpage = urllib2.urlopen(request).read()
1842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1846 # Extract uploader and title from webpage
1847 self.report_extraction(video_id)
1848 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1850 self._downloader.trouble(u'ERROR: unable to extract video title')
1852 video_title = mobj.group(1).decode('utf-8')
1853 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1855 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1857 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# FIXME(review): group(1) matches the literal 'people'/'profile' path
# segment; the uploader name is group(2) — looks like a bug, confirm
# against upstream history.
1859 video_uploader = mobj.group(1).decode('utf-8')
1861 # Extract video thumbnail
1862 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1864 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1866 video_thumbnail = mobj.group(1).decode('utf-8')
1868 # Extract video description
1869 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1871 self._downloader.trouble(u'ERROR: unable to extract video description')
1873 video_description = mobj.group(1).decode('utf-8')
1874 if not video_description:
1875 video_description = 'No description available.'
1877 # Extract video height and width
1878 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1880 self._downloader.trouble(u'ERROR: unable to extract video height')
1882 yv_video_height = mobj.group(1)
1884 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1886 self._downloader.trouble(u'ERROR: unable to extract video width')
1888 yv_video_width = mobj.group(1)
1890 # Retrieve video playlist to extract media URL
1891 # I'm not completely sure what all these options are, but we
1892 # seem to need most of them, otherwise the server sends a 401.
1893 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1894 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1895 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1896 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1897 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1899 self.report_download_webpage(video_id)
1900 webpage = urllib2.urlopen(request).read()
1901 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1902 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1905 # Extract media URL from playlist XML
1906 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1908 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1910 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Resolve any HTML entities left in the URL.
1911 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1914 # Process video information
1915 self._downloader.process_info({
1916 'id': video_id.decode('utf-8'),
1918 'uploader': video_uploader,
1919 'upload_date': u'NA',
1920 'title': video_title,
1921 'stitle': simple_title,
1922 'ext': video_extension.decode('utf-8'),
# FIXME(review): duplicate 'thumbnail' key — the later, un-decoded entry
# (line 1925) silently overrides this decoded one; drop one of them.
1923 'thumbnail': video_thumbnail.decode('utf-8'),
1924 'description': video_description,
1925 'thumbnail': video_thumbnail,
1928 except UnavailableVideoError:
1929 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing with elided lines (gaps in the numbering);
# guards such as `if mobj is None:` are partially missing from view.
1932 class VimeoIE(InfoExtractor):
1933 """Information extractor for vimeo.com."""
1935 # _VALID_URL matches Vimeo URLs
1936 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1939 def __init__(self, downloader=None):
1940 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers ---
1942 def report_download_webpage(self, video_id):
1943 """Report webpage download."""
1944 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1946 def report_extraction(self, video_id):
1947 """Report information extraction."""
1948 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No initialization needed (body elided in this listing).
1950 def _real_initialize(self):
1953 def _real_extract(self, url, new_video=True):
1954 # Extract ID from URL
1955 mobj = re.match(self._VALID_URL, url)
1957 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1960 # At this point we have a new video
1961 self._downloader.increment_downloads()
1962 video_id = mobj.group(1)
# Metadata comes from the moogaloop XML config endpoint, not the HTML page.
1964 # Retrieve video webpage to extract further information
1965 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1967 self.report_download_webpage(video_id)
1968 webpage = urllib2.urlopen(request).read()
1969 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1970 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1973 # Now we begin extracting as much information as we can from what we
1974 # retrieved. First we extract the information common to all extractors,
1975 # and latter we extract those that are Vimeo specific.
1976 self.report_extraction(video_id)
1979 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1981 self._downloader.trouble(u'ERROR: unable to extract video title')
1983 video_title = mobj.group(1).decode('utf-8')
1984 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1987 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1989 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1991 video_uploader = mobj.group(1).decode('utf-8')
1993 # Extract video thumbnail
1994 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1996 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1998 video_thumbnail = mobj.group(1).decode('utf-8')
# FIXME(review): real description extraction is commented out below and a
# hard-coded placeholder 'Foo.' is used instead — restore or remove.
2000 # # Extract video description
2001 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2003 # self._downloader.trouble(u'ERROR: unable to extract video description')
2005 # video_description = mobj.group(1).decode('utf-8')
2006 # if not video_description: video_description = 'No description available.'
2007 video_description = 'Foo.'
2009 # Vimeo specific: extract request signature
2010 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2012 self._downloader.trouble(u'ERROR: unable to extract request signature')
2014 sig = mobj.group(1).decode('utf-8')
2016 # Vimeo specific: Extract request signature expiration
2017 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2019 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2021 sig_exp = mobj.group(1).decode('utf-8')
# Play URL is built from the clip id plus the signature and its expiry.
2023 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2026 # Process video information
2027 self._downloader.process_info({
2028 'id': video_id.decode('utf-8'),
2030 'uploader': video_uploader,
2031 'upload_date': u'NA',
2032 'title': video_title,
2033 'stitle': simple_title,
# FIXME(review): duplicate 'thumbnail' (2035/2037) and 'description'
# (2036/2038) keys — the later entries silently override the earlier
# ones, so the un-decoded thumbnail wins; deduplicate.
2035 'thumbnail': video_thumbnail.decode('utf-8'),
2036 'description': video_description,
2037 'thumbnail': video_thumbnail,
2038 'description': video_description,
2041 except UnavailableVideoError:
2042 self._downloader.trouble(u'ERROR: unable to download video')
2045 class GenericIE(InfoExtractor):
2046 """Generic last-resort information extractor."""
2049 IE_NAME = u'generic'
# Plain pass-through constructor; all state lives in the base class.
2051 def __init__(self, downloader=None):
2052 InfoExtractor.__init__(self, downloader)
2054 def report_download_webpage(self, video_id):
2055 """Report webpage download."""
# Unlike the site-specific extractors, also warn that this is the
# last-resort generic extractor.
2056 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2057 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2059 def report_extraction(self, video_id):
2060 """Report information extraction."""
2061 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2063 def _real_initialize(self):
2066 def _real_extract(self, url):
2067 # At this point we have a new video
2068 self._downloader.increment_downloads()
2070 video_id = url.split('/')[-1]
2071 request = urllib2.Request(url)
2073 self.report_download_webpage(video_id)
2074 webpage = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2078 except ValueError, err:
2079 # since this is the last-resort InfoExtractor, if
2080 # this error is thrown, it'll be thrown here
2081 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2084 self.report_extraction(video_id)
2085 # Start with something easy: JW Player in SWFObject
2086 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2088 # Broaden the search a little bit
2089 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2091 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2094 # It's possible that one of the regexes
2095 # matched, but returned an empty group:
2096 if mobj.group(1) is None:
2097 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2100 video_url = urllib.unquote(mobj.group(1))
2101 video_id = os.path.basename(video_url)
2103 # here's a fun little line of code for you:
2104 video_extension = os.path.splitext(video_id)[1][1:]
2105 video_id = os.path.splitext(video_id)[0]
2107 # it's tempting to parse this further, but you would
2108 # have to take into account all the variations like
2109 # Video Title - Site Name
2110 # Site Name | Video Title
2111 # Video Title - Tagline | Site Name
2112 # and so on and so forth; it's just not practical
2113 mobj = re.search(r'<title>(.*)</title>', webpage)
2115 self._downloader.trouble(u'ERROR: unable to extract title')
2117 video_title = mobj.group(1).decode('utf-8')
2118 video_title = sanitize_title(video_title)
2119 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2121 # video uploader is domain name
2122 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2124 self._downloader.trouble(u'ERROR: unable to extract title')
2126 video_uploader = mobj.group(1).decode('utf-8')
2129 # Process video information
2130 self._downloader.process_info({
2131 'id': video_id.decode('utf-8'),
2132 'url': video_url.decode('utf-8'),
2133 'uploader': video_uploader,
2134 'upload_date': u'NA',
2135 'title': video_title,
2136 'stitle': simple_title,
2137 'ext': video_extension.decode('utf-8'),
2141 except UnavailableVideoError, err:
2142 self._downloader.trouble(u'\nERROR: unable to download video')
2145 class YoutubeSearchIE(InfoExtractor):
2146 """Information Extractor for YouTube search queries."""
2147 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2148 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2149 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2150 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2152 _max_youtube_results = 1000
2153 IE_NAME = u'youtube:search'
2155 def __init__(self, youtube_ie, downloader=None):
2156 InfoExtractor.__init__(self, downloader)
2157 self._youtube_ie = youtube_ie
2159 def report_download_page(self, query, pagenum):
2160 """Report attempt to download playlist page with given number."""
2161 query = query.decode(preferredencoding())
2162 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2164 def _real_initialize(self):
2165 self._youtube_ie.initialize()
2167 def _real_extract(self, query):
2168 mobj = re.match(self._VALID_URL, query)
2170 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2173 prefix, query = query.split(':')
2175 query = query.encode('utf-8')
2177 self._download_n_results(query, 1)
2179 elif prefix == 'all':
2180 self._download_n_results(query, self._max_youtube_results)
2186 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2188 elif n > self._max_youtube_results:
2189 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2190 n = self._max_youtube_results
2191 self._download_n_results(query, n)
2193 except ValueError: # parsing prefix as integer fails
2194 self._download_n_results(query, 1)
2197 def _download_n_results(self, query, n):
2198 """Downloads a specified number of results for a query"""
2201 already_seen = set()
2205 self.report_download_page(query, pagenum)
2206 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2207 request = urllib2.Request(result_url)
2209 page = urllib2.urlopen(request).read()
2210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2211 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2214 # Extract video identifiers
2215 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2216 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2217 if video_id not in already_seen:
2218 video_ids.append(video_id)
2219 already_seen.add(video_id)
2220 if len(video_ids) == n:
2221 # Specified n videos reached
2222 for id in video_ids:
2223 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2226 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2227 for id in video_ids:
2228 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2231 pagenum = pagenum + 1
2234 class GoogleSearchIE(InfoExtractor):
2235 """Information Extractor for Google Video search queries."""
2236 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2237 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2238 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2239 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2241 _max_google_results = 1000
2242 IE_NAME = u'video.google:search'
2244 def __init__(self, google_ie, downloader=None):
2245 InfoExtractor.__init__(self, downloader)
2246 self._google_ie = google_ie
2248 def report_download_page(self, query, pagenum):
2249 """Report attempt to download playlist page with given number."""
2250 query = query.decode(preferredencoding())
2251 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2253 def _real_initialize(self):
2254 self._google_ie.initialize()
2256 def _real_extract(self, query):
2257 mobj = re.match(self._VALID_URL, query)
2259 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2262 prefix, query = query.split(':')
2264 query = query.encode('utf-8')
2266 self._download_n_results(query, 1)
2268 elif prefix == 'all':
2269 self._download_n_results(query, self._max_google_results)
2275 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2277 elif n > self._max_google_results:
2278 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2279 n = self._max_google_results
2280 self._download_n_results(query, n)
2282 except ValueError: # parsing prefix as integer fails
2283 self._download_n_results(query, 1)
2286 def _download_n_results(self, query, n):
2287 """Downloads a specified number of results for a query"""
2290 already_seen = set()
2294 self.report_download_page(query, pagenum)
2295 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2296 request = urllib2.Request(result_url)
2298 page = urllib2.urlopen(request).read()
2299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2303 # Extract video identifiers
2304 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2305 video_id = mobj.group(1)
2306 if video_id not in already_seen:
2307 video_ids.append(video_id)
2308 already_seen.add(video_id)
2309 if len(video_ids) == n:
2310 # Specified n videos reached
2311 for id in video_ids:
2312 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2315 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2316 for id in video_ids:
2317 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2320 pagenum = pagenum + 1
2323 class YahooSearchIE(InfoExtractor):
2324 """Information Extractor for Yahoo! Video search queries."""
2325 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2326 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2327 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2328 _MORE_PAGES_INDICATOR = r'\s*Next'
2330 _max_yahoo_results = 1000
2331 IE_NAME = u'video.yahoo:search'
2333 def __init__(self, yahoo_ie, downloader=None):
2334 InfoExtractor.__init__(self, downloader)
2335 self._yahoo_ie = yahoo_ie
2337 def report_download_page(self, query, pagenum):
2338 """Report attempt to download playlist page with given number."""
2339 query = query.decode(preferredencoding())
2340 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2342 def _real_initialize(self):
2343 self._yahoo_ie.initialize()
2345 def _real_extract(self, query):
2346 mobj = re.match(self._VALID_URL, query)
2348 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2351 prefix, query = query.split(':')
2353 query = query.encode('utf-8')
2355 self._download_n_results(query, 1)
2357 elif prefix == 'all':
2358 self._download_n_results(query, self._max_yahoo_results)
2364 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2366 elif n > self._max_yahoo_results:
2367 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2368 n = self._max_yahoo_results
2369 self._download_n_results(query, n)
2371 except ValueError: # parsing prefix as integer fails
2372 self._download_n_results(query, 1)
2375 def _download_n_results(self, query, n):
2376 """Downloads a specified number of results for a query"""
2379 already_seen = set()
2383 self.report_download_page(query, pagenum)
2384 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2385 request = urllib2.Request(result_url)
2387 page = urllib2.urlopen(request).read()
2388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2392 # Extract video identifiers
2393 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2394 video_id = mobj.group(1)
2395 if video_id not in already_seen:
2396 video_ids.append(video_id)
2397 already_seen.add(video_id)
2398 if len(video_ids) == n:
2399 # Specified n videos reached
2400 for id in video_ids:
2401 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2404 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2405 for id in video_ids:
2406 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2409 pagenum = pagenum + 1
2412 class YoutubePlaylistIE(InfoExtractor):
2413 """Information Extractor for YouTube playlists."""
2415 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2416 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2417 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2418 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2420 IE_NAME = u'youtube:playlist'
2422 def __init__(self, youtube_ie, downloader=None):
2423 InfoExtractor.__init__(self, downloader)
2424 self._youtube_ie = youtube_ie
2426 def report_download_page(self, playlist_id, pagenum):
2427 """Report attempt to download playlist page with given number."""
2428 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2430 def _real_initialize(self):
2431 self._youtube_ie.initialize()
2433 def _real_extract(self, url):
2434 # Extract playlist id
2435 mobj = re.match(self._VALID_URL, url)
2437 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2441 if mobj.group(3) is not None:
2442 self._youtube_ie.extract(mobj.group(3))
2445 # Download playlist pages
2446 # prefix is 'p' as default for playlists but there are other types that need extra care
2447 playlist_prefix = mobj.group(1)
2448 if playlist_prefix == 'a':
2449 playlist_access = 'artist'
2451 playlist_prefix = 'p'
2452 playlist_access = 'view_play_list'
2453 playlist_id = mobj.group(2)
2458 self.report_download_page(playlist_id, pagenum)
2459 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2461 page = urllib2.urlopen(request).read()
2462 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2463 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2466 # Extract video identifiers
2468 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2469 if mobj.group(1) not in ids_in_page:
2470 ids_in_page.append(mobj.group(1))
2471 video_ids.extend(ids_in_page)
2473 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2475 pagenum = pagenum + 1
2477 playliststart = self._downloader.params.get('playliststart', 1) - 1
2478 playlistend = self._downloader.params.get('playlistend', -1)
2479 video_ids = video_ids[playliststart:playlistend]
2481 for id in video_ids:
2482 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2486 class YoutubeUserIE(InfoExtractor):
2487 """Information Extractor for YouTube users."""
2489 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2490 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2491 _GDATA_PAGE_SIZE = 50
2492 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2493 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2495 IE_NAME = u'youtube:user'
2497 def __init__(self, youtube_ie, downloader=None):
2498 InfoExtractor.__init__(self, downloader)
2499 self._youtube_ie = youtube_ie
2501 def report_download_page(self, username, start_index):
2502 """Report attempt to download user page."""
2503 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2504 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2506 def _real_initialize(self):
2507 self._youtube_ie.initialize()
2509 def _real_extract(self, url):
2511 mobj = re.match(self._VALID_URL, url)
2513 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2516 username = mobj.group(1)
2518 # Download video ids using YouTube Data API. Result size per
2519 # query is limited (currently to 50 videos) so we need to query
2520 # page by page until there are no video ids - it means we got
2527 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2528 self.report_download_page(username, start_index)
2530 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2533 page = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2538 # Extract video identifiers
2541 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2542 if mobj.group(1) not in ids_in_page:
2543 ids_in_page.append(mobj.group(1))
2545 video_ids.extend(ids_in_page)
2547 # A little optimization - if current page is not
2548 # "full", ie. does not contain PAGE_SIZE video ids then
2549 # we can assume that this page is the last one - there
2550 # are no more ids on further pages - no need to query
2553 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2558 all_ids_count = len(video_ids)
2559 playliststart = self._downloader.params.get('playliststart', 1) - 1
2560 playlistend = self._downloader.params.get('playlistend', -1)
2562 if playlistend == -1:
2563 video_ids = video_ids[playliststart:]
2565 video_ids = video_ids[playliststart:playlistend]
2567 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2568 (username, all_ids_count, len(video_ids)))
2570 for video_id in video_ids:
2571 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2574 class DepositFilesIE(InfoExtractor):
2575 """Information extractor for depositfiles.com"""
2577 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2578 IE_NAME = u'DepositFiles'
2580 def __init__(self, downloader=None):
2581 InfoExtractor.__init__(self, downloader)
2583 def report_download_webpage(self, file_id):
2584 """Report webpage download."""
2585 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2587 def report_extraction(self, file_id):
2588 """Report information extraction."""
2589 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2591 def _real_initialize(self):
2594 def _real_extract(self, url):
2595 # At this point we have a new file
2596 self._downloader.increment_downloads()
2598 file_id = url.split('/')[-1]
2599 # Rebuild url in english locale
2600 url = 'http://depositfiles.com/en/files/' + file_id
2602 # Retrieve file webpage with 'Free download' button pressed
2603 free_download_indication = { 'gateway_result' : '1' }
2604 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2606 self.report_download_webpage(file_id)
2607 webpage = urllib2.urlopen(request).read()
2608 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2609 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2612 # Search for the real file URL
2613 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2614 if (mobj is None) or (mobj.group(1) is None):
2615 # Try to figure out reason of the error.
2616 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2617 if (mobj is not None) and (mobj.group(1) is not None):
2618 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2619 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2621 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2624 file_url = mobj.group(1)
2625 file_extension = os.path.splitext(file_url)[1][1:]
2627 # Search for file title
2628 mobj = re.search(r'<b title="(.*?)">', webpage)
2630 self._downloader.trouble(u'ERROR: unable to extract title')
2632 file_title = mobj.group(1).decode('utf-8')
2635 # Process file information
2636 self._downloader.process_info({
2637 'id': file_id.decode('utf-8'),
2638 'url': file_url.decode('utf-8'),
2640 'upload_date': u'NA',
2641 'title': file_title,
2642 'stitle': file_title,
2643 'ext': file_extension.decode('utf-8'),
2647 except UnavailableVideoError, err:
2648 self._downloader.trouble(u'ERROR: unable to download file')
2651 class FacebookIE(InfoExtractor):
2652 """Information Extractor for Facebook"""
2654 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2655 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2656 _NETRC_MACHINE = 'facebook'
2657 _available_formats = ['highqual', 'lowqual']
2658 _video_extensions = {
2662 IE_NAME = u'facebook'
2664 def __init__(self, downloader=None):
2665 InfoExtractor.__init__(self, downloader)
2667 def _reporter(self, message):
2668 """Add header and report message."""
2669 self._downloader.to_screen(u'[facebook] %s' % message)
2671 def report_login(self):
2672 """Report attempt to log in."""
2673 self._reporter(u'Logging in')
2675 def report_video_webpage_download(self, video_id):
2676 """Report attempt to download video webpage."""
2677 self._reporter(u'%s: Downloading video webpage' % video_id)
2679 def report_information_extraction(self, video_id):
2680 """Report attempt to extract video information."""
2681 self._reporter(u'%s: Extracting video information' % video_id)
2683 def _parse_page(self, video_webpage):
2684 """Extract video information from page"""
2686 data = {'title': r'class="video_title datawrap">(.*?)</',
2687 'description': r'<div class="datawrap">(.*?)</div>',
2688 'owner': r'\("video_owner_name", "(.*?)"\)',
2689 'upload_date': r'data-date="(.*?)"',
2690 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2693 for piece in data.keys():
2694 mobj = re.search(data[piece], video_webpage)
2695 if mobj is not None:
2696 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2700 for fmt in self._available_formats:
2701 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2702 if mobj is not None:
2703 # URL is in a Javascript segment inside an escaped Unicode format within
2704 # the generally utf-8 page
2705 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2706 video_info['video_urls'] = video_urls
2710 def _real_initialize(self):
2711 if self._downloader is None:
2716 downloader_params = self._downloader.params
2718 # Attempt to use provided username and password or .netrc data
2719 if downloader_params.get('username', None) is not None:
2720 useremail = downloader_params['username']
2721 password = downloader_params['password']
2722 elif downloader_params.get('usenetrc', False):
2724 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2725 if info is not None:
2729 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2730 except (IOError, netrc.NetrcParseError), err:
2731 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2734 if useremail is None:
2743 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2746 login_results = urllib2.urlopen(request).read()
2747 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2748 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2750 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2751 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2754 def _real_extract(self, url):
2755 mobj = re.match(self._VALID_URL, url)
2757 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2759 video_id = mobj.group('ID')
2762 self.report_video_webpage_download(video_id)
2763 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2765 page = urllib2.urlopen(request)
2766 video_webpage = page.read()
2767 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2768 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2771 # Start extracting information
2772 self.report_information_extraction(video_id)
2774 # Extract information
2775 video_info = self._parse_page(video_webpage)
2778 if 'owner' not in video_info:
2779 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2781 video_uploader = video_info['owner']
2784 if 'title' not in video_info:
2785 self._downloader.trouble(u'ERROR: unable to extract video title')
2787 video_title = video_info['title']
2788 video_title = video_title.decode('utf-8')
2789 video_title = sanitize_title(video_title)
2792 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2793 simple_title = simple_title.strip(ur'_')
2796 if 'thumbnail' not in video_info:
2797 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2798 video_thumbnail = ''
2800 video_thumbnail = video_info['thumbnail']
2804 if 'upload_date' in video_info:
2805 upload_time = video_info['upload_date']
2806 timetuple = email.utils.parsedate_tz(upload_time)
2807 if timetuple is not None:
2809 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2814 video_description = video_info.get('description', 'No description available.')
2816 url_map = video_info['video_urls']
2817 if len(url_map.keys()) > 0:
2818 # Decide which formats to download
2819 req_format = self._downloader.params.get('format', None)
2820 format_limit = self._downloader.params.get('format_limit', None)
2822 if format_limit is not None and format_limit in self._available_formats:
2823 format_list = self._available_formats[self._available_formats.index(format_limit):]
2825 format_list = self._available_formats
2826 existing_formats = [x for x in format_list if x in url_map]
2827 if len(existing_formats) == 0:
2828 self._downloader.trouble(u'ERROR: no known formats available for video')
2830 if req_format is None:
2831 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2832 elif req_format == '-1':
2833 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2836 if req_format not in url_map:
2837 self._downloader.trouble(u'ERROR: requested format not available')
2839 video_url_list = [(req_format, url_map[req_format])] # Specific format
2841 for format_param, video_real_url in video_url_list:
2843 # At this point we have a new video
2844 self._downloader.increment_downloads()
2847 video_extension = self._video_extensions.get(format_param, 'mp4')
2850 # Process video information
2851 self._downloader.process_info({
2852 'id': video_id.decode('utf-8'),
2853 'url': video_real_url.decode('utf-8'),
2854 'uploader': video_uploader.decode('utf-8'),
2855 'upload_date': upload_date,
2856 'title': video_title,
2857 'stitle': simple_title,
2858 'ext': video_extension.decode('utf-8'),
2859 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2860 'thumbnail': video_thumbnail.decode('utf-8'),
2861 'description': video_description.decode('utf-8'),
2864 except UnavailableVideoError, err:
2865 self._downloader.trouble(u'\nERROR: unable to download video')
2867 class BlipTVIE(InfoExtractor):
2868 """Information extractor for blip.tv"""
2870 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2871 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2872 IE_NAME = u'blip.tv'
2874 def report_extraction(self, file_id):
2875 """Report information extraction."""
2876 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2878 def _simplify_title(self, title):
2879 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2880 res = res.strip(ur'_')
2883 def _real_extract(self, url):
2884 mobj = re.match(self._VALID_URL, url)
2886 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2893 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2894 request = urllib2.Request(json_url)
2895 self.report_extraction(mobj.group(1))
2897 json_code = urllib2.urlopen(request).read()
2898 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2899 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2902 json_data = json.loads(json_code)
2903 if 'Post' in json_data:
2904 data = json_data['Post']
2908 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2909 video_url = data['media']['url']
2910 umobj = re.match(self._URL_EXT, video_url)
2912 raise ValueError('Can not determine filename extension')
2913 ext = umobj.group(1)
2915 self._downloader.increment_downloads()
2918 'id': data['item_id'],
2920 'uploader': data['display_name'],
2921 'upload_date': upload_date,
2922 'title': data['title'],
2923 'stitle': self._simplify_title(data['title']),
2925 'format': data['media']['mimeType'],
2926 'thumbnail': data['thumbnailUrl'],
2927 'description': data['description'],
2928 'player_url': data['embedUrl']
2930 except (ValueError,KeyError), err:
2931 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2935 self._downloader.process_info(info)
2936 except UnavailableVideoError, err:
2937 self._downloader.trouble(u'\nERROR: unable to download video')
2940 class MyVideoIE(InfoExtractor):
2941 """Information Extractor for myvideo.de."""
2943 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2944 IE_NAME = u'myvideo'
2946 def __init__(self, downloader=None):
2947 InfoExtractor.__init__(self, downloader)
2949 def report_download_webpage(self, video_id):
2950 """Report webpage download."""
2951 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2953 def report_extraction(self, video_id):
2954 """Report information extraction."""
2955 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2957 def _real_initialize(self):
2960 def _real_extract(self,url):
2961 mobj = re.match(self._VALID_URL, url)
2963 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2966 video_id = mobj.group(1)
2967 simple_title = mobj.group(2).decode('utf-8')
2968 # should actually not be necessary
2969 simple_title = sanitize_title(simple_title)
2970 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2973 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2975 self.report_download_webpage(video_id)
2976 webpage = urllib2.urlopen(request).read()
2977 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2978 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2981 self.report_extraction(video_id)
2982 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2985 self._downloader.trouble(u'ERROR: unable to extract media URL')
2987 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2989 mobj = re.search('<title>([^<]+)</title>', webpage)
2991 self._downloader.trouble(u'ERROR: unable to extract title')
2994 video_title = mobj.group(1)
2995 video_title = sanitize_title(video_title)
2999 self._downloader.process_info({
3003 'upload_date': u'NA',
3004 'title': video_title,
3005 'stitle': simple_title,
3010 except UnavailableVideoError:
3011 self._downloader.trouble(u'\nERROR: Unable to download video')
# Extractor for Comedy Central full-episode pages (The Daily Show and
# The Colbert Report).
# NOTE(review): this listing is line-sampled -- control-flow lines such as
# `try:`, `return`, `else:` and `if mobj is None:` fall into the numbering
# gaps, so the comments below describe only the statements that are visible.
3013 class ComedyCentralIE(InfoExtractor):
3014 """Information extractor for The Daily Show and Colbert Report """
# Matches either a bare alias (":tds", ":colbert", ...) via the <shortname>
# group, or a full-episodes URL on thedailyshow.com / colbertnation.com;
# the <showname> and <episode> groups drive _real_extract below.
3016 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3017 IE_NAME = u'comedycentral'
# --- progress-reporting helpers: all print via the owning downloader ---
3019 def report_extraction(self, episode_id):
3020 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3022 def report_config_download(self, episode_id):
3023 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3025 def report_index_download(self, episode_id):
3026 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3028 def report_player_url(self, episode_id):
3029 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Collapse any run of characters outside the [A-Za-z0-9] whitelist
# (module-level `simple_title_chars`) into a single underscore, then trim
# leading/trailing underscores.  Used for the 'stitle' output field.
3031 def _simplify_title(self, title):
3032 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3033 res = res.strip(ur'_')
3036 def _real_extract(self, url):
3037 mobj = re.match(self._VALID_URL, url)
3039 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A shortname alias is rewritten to the show's full-episodes landing page
# and the URL is re-matched so the <showname>/<episode> groups are filled.
3042 if mobj.group('shortname'):
3043 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3044 url = 'http://www.thedailyshow.com/full-episodes/'
3046 url = 'http://www.colbertnation.com/full-episodes/'
3047 mobj = re.match(self._VALID_URL, url)
3048 assert mobj is not None
# No explicit episode in the URL -> we will follow the site's redirect to
# the newest episode (dlNewest is consumed in a gap in this listing).
3050 dlNewest = not mobj.group('episode')
3052 epTitle = mobj.group('showname')
3054 epTitle = mobj.group('episode')
3056 req = urllib2.Request(url)
3057 self.report_extraction(epTitle)
3059 htmlHandle = urllib2.urlopen(req)
3060 html = htmlHandle.read()
3061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3062 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After redirects the final URL must name a concrete episode; re-match to
# recover it.
3065 url = htmlHandle.geturl()
3066 mobj = re.match(self._VALID_URL, url)
3068 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3070 if mobj.group('episode') == '':
3071 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3073 epTitle = mobj.group('episode')
# Each match is (full player URL, mtvnservices URI); the URI keys both the
# MRSS index and, later, the mediaGen configuration.
3075 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3076 if len(mMovieParams) == 0:
3077 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL by letting urllib2 follow redirects; the final
# location is passed to rtmpdump-style downloads via 'player_url'.
3080 playerUrl_raw = mMovieParams[0][0]
3081 self.report_player_url(epTitle)
3083 urlHandle = urllib2.urlopen(playerUrl_raw)
3084 playerUrl = urlHandle.geturl()
3085 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3086 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The MRSS feed lists one <item> per video segment ("act") of the episode.
3089 uri = mMovieParams[0][1]
3090 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3091 self.report_index_download(epTitle)
3093 indexXml = urllib2.urlopen(indexUrl).read()
3094 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3095 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3098 idoc = xml.etree.ElementTree.fromstring(indexXml)
3099 itemEls = idoc.findall('.//item')
3100 for itemEl in itemEls:
# <guid> looks like a colon-separated mtvn URI; last component is the media
# id, second-to-last carries the show's domain name.
3101 mediaId = itemEl.findall('./guid')[0].text
3102 shortMediaId = mediaId.split(':')[-1]
3103 showId = mediaId.split(':')[-2].replace('.com', '')
3104 officialTitle = itemEl.findall('./title')[0].text
3105 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item mediaGen config enumerates the available renditions.
3107 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3108 urllib.urlencode({'uri': mediaId}))
3109 configReq = urllib2.Request(configUrl)
3110 self.report_config_download(epTitle)
3112 configXml = urllib2.urlopen(configReq).read()
3113 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3114 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3117 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, source URL) pairs; presumably appended to `turls`
# (the accumulator assignment is in a listing gap) -- TODO confirm.
3119 for rendition in cdoc.findall('.//rendition'):
3120 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3124 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3127 # For now, just pick the highest bitrate
3128 format,video_url = turls[-1]
3130 self._downloader.increment_downloads()
3132 effTitle = showId + '-' + epTitle
# Hand the assembled info dict to the downloader; remaining keys of the
# dict literal fall in listing gaps.
3137 'upload_date': officialDate,
3139 'stitle': self._simplify_title(effTitle),
3143 'description': officialTitle,
3144 'player_url': playerUrl
3148 self._downloader.process_info(info)
3149 except UnavailableVideoError, err:
3150 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com video pages.
# NOTE(review): this listing is line-sampled -- `try:`/`return` and similar
# control-flow lines fall into the numbering gaps.
3154 class EscapistIE(InfoExtractor):
3155 """Information extractor for The Escapist """
# NOTE(review): `(www\.)` lacks a trailing `?`, so URLs without the "www."
# prefix never match even though the scheme is optional -- confirm whether
# `(www\.)?` was intended.  Also the dots in the host name are unescaped.
3157 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3158 IE_NAME = u'escapist'
# --- progress-reporting helpers: print via the owning downloader ---
3160 def report_extraction(self, showName):
3161 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3163 def report_config_download(self, showName):
3164 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Collapse runs of non-whitelisted characters into '_' and trim; mirrors
# ComedyCentralIE._simplify_title (trailing `return res` is in a gap).
3166 def _simplify_title(self, title):
3167 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3168 res = res.strip(ur'_')
3171 def _real_extract(self, url):
# Parser instance only used for its unescape() helper below.
3172 htmlParser = HTMLParser.HTMLParser()
3174 mobj = re.match(self._VALID_URL, url)
3176 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3178 showName = mobj.group('showname')
3179 videoId = mobj.group('episode')
3181 self.report_extraction(showName)
3183 webPage = urllib2.urlopen(url).read()
3184 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3185 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata out of the page's <meta> tags; the player config URL is
# carried in the og:video URL's "config=" query parameter.
3188 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3189 description = htmlParser.unescape(descMatch.group(1))
3190 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3191 imgUrl = htmlParser.unescape(imgMatch.group(1))
3192 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3193 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3194 configUrlMatch = re.search('config=(.*)$', playerUrl)
3195 configUrl = urllib2.unquote(configUrlMatch.group(1))
3197 self.report_config_download(showName)
3199 configJSON = urllib2.urlopen(configUrl).read()
3200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3201 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3204 # Technically, it's JavaScript, not JSON
# Crude JS->JSON repair: swap single quotes for double quotes so the
# (possibly trivialjson) parser accepts it.
3205 configJSON = configJSON.replace("'", '"')
3208 config = json.loads(configJSON)
3209 except (ValueError,), err:
3210 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The media URL is taken from the second playlist entry; presumably entry 0
# is an intro/advertisement clip -- TODO confirm against a live config.
3213 playlist = config['playlist']
3214 videoUrl = playlist[1]['url']
3216 self._downloader.increment_downloads()
# Build the info dict for the downloader (some keys are in listing gaps).
3220 'uploader': showName,
3221 'upload_date': None,
3223 'stitle': self._simplify_title(showName),
3226 'thumbnail': imgUrl,
3227 'description': description,
3228 'player_url': playerUrl,
3232 self._downloader.process_info(info)
3233 except UnavailableVideoError, err:
3234 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class PostProcessor(object):
	"""Base class for post-processing steps.

	PostProcessor objects register with a downloader through its
	add_post_processor() method ("mutual registration", the same
	scheme InfoExtractor uses).  After every successful download the
	downloader calls run() on each registered processor in turn,
	feeding each one the dictionary returned by its predecessor; the
	chain stops as soon as a processor returns None, or when its end
	is reached.
	"""

	def __init__(self, downloader=None):
		"""Create the processor, optionally bound to a downloader."""
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Bind this post-processor to *downloader*."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is an InfoExtractor-style dictionary extended
		with an extra 'filepath' key naming the downloaded file.
		Returning None stops the post-processing chain; returning a
		(possibly modified) dictionary forwards it to the next
		processor.  Implementations may raise PostProcessingError to
		signal failure to the downloader.

		The default implementation is the identity.
		"""
		return information
# Post-processor that extracts the audio track of a downloaded video with
# ffmpeg/ffprobe.
# NOTE(review): this listing is line-sampled -- decorators, `try:` lines and
# several `return` statements fall into the numbering gaps.
3285 class FFmpegExtractAudioPP(PostProcessor):
3287 def __init__(self, downloader=None, preferredcodec=None):
3288 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source audio codec when it is aac/mp3, otherwise
# transcode (see run() below).
3289 if preferredcodec is None:
3290 preferredcodec = 'best'
3291 self._preferredcodec = preferredcodec
# Probe *path* with ffprobe and report the codec of its audio stream; the
# codec_name= line precedes codec_type=audio in ffprobe's stream dump, so
# the last codec_name seen before the audio marker is the one returned
# (return statements are in listing gaps).  Returns None on probe failure.
3294 def get_audio_codec(path):
3296 cmd = ['ffprobe', '-show_streams', '--', path]
3297 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3298 output = handle.communicate()[0]
3299 if handle.wait() != 0:
3301 except (IOError, OSError):
3304 for line in output.split('\n'):
3305 if line.startswith('codec_name='):
3306 audio_codec = line.split('=')[1].strip()
3307 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to write the audio-only file; '-vn' drops the video stream and
# '--' guards against paths that look like options.  Output and errors are
# discarded (the success test on `ret` is in a listing gap).
3312 def run_ffmpeg(path, out_path, codec, more_opts):
3314 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3315 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3317 except (IOError, OSError):
3320 def run(self, information):
3321 path = information['filepath']
3323 filecodec = self.get_audio_codec(path)
3324 if filecodec is None:
3325 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Source already has the wanted codec (or 'best' was requested with an
# aac/mp3 source): keep the stream lossless -- presumably acodec='copy'
# on the line in the gap (TODO confirm); aac needs an ADTS container.
3329 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3330 if filecodec == 'aac' or filecodec == 'mp3':
3331 # Lossless if possible
3333 extension = filecodec
3334 if filecodec == 'aac':
3335 more_opts = ['-f', 'adts']
# Non-aac/mp3 source under 'best': transcode to mp3 at 128k.
3338 acodec = 'libmp3lame'
3340 more_opts = ['-ab', '128k']
3342 # We convert the audio (lossy)
3343 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3344 extension = self._preferredcodec
3345 more_opts = ['-ab', '128k']
3346 if self._preferredcodec == 'aac':
3347 more_opts += ['-f', 'adts']
# Write <original stem>.<audio extension> next to the source file, then
# (lines in gaps) remove the source video on success and update 'filepath'.
3349 (prefix, ext) = os.path.splitext(path)
3350 new_path = prefix + '.' + extension
3351 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3352 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3355 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3360 except (IOError, OSError):
3361 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3364 information['filepath'] = new_path
# Self-update: overwrite *filename* (normally sys.argv[0]) with the latest
# script fetched from UPDATE_URL.  NOTE(review): listing is line-sampled --
# the `try:`/`finally:` lines around the two I/O sections are in gaps.
3368 def updateSelf(downloader, filename):
3369 ''' Update the program file with the latest version from the repository '''
3370 # Note: downloader only used for options
# Fail early if we cannot write the target; avoids a truncated binary later.
3371 if not os.access(filename, os.W_OK):
3372 sys.exit('ERROR: no write permissions on %s' % filename)
3374 downloader.to_screen('Updating to latest version...')
3378 urlh = urllib.urlopen(UPDATE_URL)
3379 newcontent = urlh.read()
3382 except (IOError, OSError), err:
3383 sys.exit('ERROR: unable to download latest version')
# Binary mode so line endings survive unchanged on all platforms.
3386 outf = open(filename, 'wb')
3388 outf.write(newcontent)
3391 except (IOError, OSError), err:
3392 sys.exit('ERROR: unable to overwrite current version')
3394 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Helper for the optparse help formatter: renders an option's invocation
# column as "-s, --long METAVAR".  NOTE(review): the initialisation of
# `opts` (presumably `opts = []`) is in a listing gap.
3401 def _format_option_string(option):
3402 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Show at most one short and one long spelling, comma-separated.
3406 if option._short_opts: opts.append(option._short_opts[0])
3407 if option._long_opts: opts.append(option._long_opts[0])
3408 if len(opts) > 1: opts.insert(1, ', ')
# Options taking a value also display their metavar.
3410 if option.takes_value(): opts.append(' %s' % option.metavar)
3412 return "".join(opts)
# Best-effort terminal width detection: prefer the COLUMNS environment
# variable, else ask `stty size` (which prints "rows cols").
# NOTE(review): listing is line-sampled -- the early return for COLUMNS, the
# `try:` and the fallback return are in numbering gaps.
3414 def _find_term_columns():
3415 columns = os.environ.get('COLUMNS', None)
3420 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3421 out,err = sp.communicate()
# Second field of `stty size` output is the column count.
3422 return int(out.split()[1])
# Body of the command-line option parser setup (the enclosing parseOpts()
# def line is outside this listing).  NOTE(review): line-sampled -- some
# statements (e.g. the `kw = {` opener, default max_width) are in gaps.
3428 max_help_position = 80
3430 # No need to wrap help messages if we're on a wide console
3431 columns = _find_term_columns()
3432 if columns: max_width = columns
3434 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3435 fmt.format_option_strings = _format_option_string
# Keyword arguments for the OptionParser (dict literal opener in a gap).
3438 'version' : __version__,
3440 'usage' : '%prog [options] url [url...]',
3441 'conflict_handler' : 'resolve',
3444 parser = optparse.OptionParser(**kw)
# One OptionGroup per help section.
3447 general = optparse.OptionGroup(parser, 'General Options')
3448 selection = optparse.OptionGroup(parser, 'Video Selection')
3449 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3450 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3451 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3452 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3453 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# --- General ---
3455 general.add_option('-h', '--help',
3456 action='help', help='print this help text and exit')
3457 general.add_option('-v', '--version',
3458 action='version', help='print program version and exit')
3459 general.add_option('-U', '--update',
3460 action='store_true', dest='update_self', help='update this program to latest version')
3461 general.add_option('-i', '--ignore-errors',
3462 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3463 general.add_option('-r', '--rate-limit',
3464 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3465 general.add_option('-R', '--retries',
3466 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3467 general.add_option('--dump-user-agent',
3468 action='store_true', dest='dump_user_agent',
3469 help='display the current browser identification', default=False)
3470 general.add_option('--list-extractors',
3471 action='store_true', dest='list_extractors',
3472 help='List all supported extractors and the URLs they would handle', default=False)
# --- Video selection ---
3474 selection.add_option('--playlist-start',
3475 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3476 selection.add_option('--playlist-end',
3477 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3478 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3479 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
# --- Authentication ---
3481 authentication.add_option('-u', '--username',
3482 dest='username', metavar='USERNAME', help='account username')
3483 authentication.add_option('-p', '--password',
3484 dest='password', metavar='PASSWORD', help='account password')
3485 authentication.add_option('-n', '--netrc',
3486 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# --- Video format ---
3489 video_format.add_option('-f', '--format',
3490 action='store', dest='format', metavar='FORMAT', help='video format code')
3491 video_format.add_option('--all-formats',
3492 action='store_const', dest='format', help='download all available video formats', const='-1')
3493 video_format.add_option('--max-quality',
3494 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# --- Verbosity / simulation ---
3497 verbosity.add_option('-q', '--quiet',
3498 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3499 verbosity.add_option('-s', '--simulate',
3500 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3501 verbosity.add_option('--skip-download',
3502 action='store_true', dest='skip_download', help='do not download the video', default=False)
3503 verbosity.add_option('-g', '--get-url',
3504 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3505 verbosity.add_option('-e', '--get-title',
3506 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3507 verbosity.add_option('--get-thumbnail',
3508 action='store_true', dest='getthumbnail',
3509 help='simulate, quiet but print thumbnail URL', default=False)
3510 verbosity.add_option('--get-description',
3511 action='store_true', dest='getdescription',
3512 help='simulate, quiet but print video description', default=False)
3513 verbosity.add_option('--get-filename',
3514 action='store_true', dest='getfilename',
3515 help='simulate, quiet but print output filename', default=False)
3516 verbosity.add_option('--no-progress',
3517 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3518 verbosity.add_option('--console-title',
3519 action='store_true', dest='consoletitle',
3520 help='display progress in console titlebar', default=False)
# --- Filesystem ---
3523 filesystem.add_option('-t', '--title',
3524 action='store_true', dest='usetitle', help='use title in file name', default=False)
3525 filesystem.add_option('-l', '--literal',
3526 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3527 filesystem.add_option('-A', '--auto-number',
3528 action='store_true', dest='autonumber',
3529 help='number downloaded files starting from 00000', default=False)
3530 filesystem.add_option('-o', '--output',
3531 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3532 filesystem.add_option('-a', '--batch-file',
3533 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3534 filesystem.add_option('-w', '--no-overwrites',
3535 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3536 filesystem.add_option('-c', '--continue',
3537 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3538 filesystem.add_option('--cookies',
3539 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3540 filesystem.add_option('--no-part',
3541 action='store_true', dest='nopart', help='do not use .part files', default=False)
3542 filesystem.add_option('--no-mtime',
3543 action='store_false', dest='updatetime',
3544 help='do not use the Last-modified header to set the file modification time', default=True)
3545 filesystem.add_option('--write-description',
3546 action='store_true', dest='writedescription',
3547 help='write video description to a .description file', default=False)
3548 filesystem.add_option('--write-info-json',
3549 action='store_true', dest='writeinfojson',
3550 help='write video metadata to a .info.json file', default=False)
# --- Post-processing ---
3553 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3554 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3555 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3556 help='"best", "aac" or "mp3"; best by default')
# Attach the groups in display order, then parse the real command line.
3559 parser.add_option_group(general)
3560 parser.add_option_group(selection)
3561 parser.add_option_group(filesystem)
3562 parser.add_option_group(verbosity)
3563 parser.add_option_group(video_format)
3564 parser.add_option_group(authentication)
3565 parser.add_option_group(postproc)
3567 opts, args = parser.parse_args()
3569 return parser, opts, args
# Factory for the full extractor chain.  NOTE(review): line-sampled -- the
# `return [` opener, several list entries (including the catch-all
# GenericIE that conventionally goes last) and the closing bracket are in
# numbering gaps.
3571 def gen_extractors():
3572 """ Return a list of an instance of every supported extractor.
3573 The order does matter; the first extractor matched is the one handling the URL.
# These three are created first because the search/playlist/user extractors
# below delegate the actual video downloads to them.
3575 youtube_ie = YoutubeIE()
3576 google_ie = GoogleIE()
3577 yahoo_ie = YahooIE()
3580 MetacafeIE(youtube_ie),
3582 YoutubePlaylistIE(youtube_ie),
3583 YoutubeUserIE(youtube_ie),
3584 YoutubeSearchIE(youtube_ie),
3586 GoogleSearchIE(google_ie),
3589 YahooSearchIE(yahoo_ie),
# Interior of the program's main routine (its def line is outside this
# listing).  NOTE(review): line-sampled -- `try:`, `else:`, `sys.exit()`
# and similar control-flow lines fall into numbering gaps.
3602 parser, opts, args = parseOpts()
3604 # Open appropriate CookieJar
# In-memory jar unless --cookies was given; an existing cookie file is
# loaded (the .load() call is in a gap), a missing one is tolerated.
3605 if opts.cookiefile is None:
3606 jar = cookielib.CookieJar()
3609 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3610 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3612 except (IOError, OSError), err:
3613 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string and exit (exit line in a gap).
3616 if opts.dump_user_agent:
3617 print std_headers['User-Agent']
3620 # Batch file verification
# Collect URLs from the batch file ('-' = stdin); blank lines and lines
# starting with #, / or ; are treated as comments and skipped.
3622 if opts.batchfile is not None:
3624 if opts.batchfile == '-':
3627 batchfd = open(opts.batchfile, 'r')
3628 batchurls = batchfd.readlines()
3629 batchurls = [x.strip() for x in batchurls]
3630 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3632 sys.exit(u'ERROR: batch file could not be read')
3633 all_urls = batchurls + args
3635 # General configuration
# Install a global opener carrying proxy support, the cookie jar and the
# project's gzip/deflate handler.
3636 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3637 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3638 urllib2.install_opener(opener)
3639 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3641 extractors = gen_extractors()
# --list-extractors: show each IE and which of the given URLs it would
# claim, consuming matched URLs so each is listed only once.
3643 if opts.list_extractors:
3644 for ie in extractors:
3646 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3647 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3648 for mu in matchedUrls:
3652 # Conflicting, missing and erroneous options
3653 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3654 parser.error(u'using .netrc conflicts with giving username/password')
3655 if opts.password is not None and opts.username is None:
3656 parser.error(u'account username missing')
3657 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3658 parser.error(u'using output template conflicts with using title, literal title or auto number')
3659 if opts.usetitle and opts.useliteral:
3660 parser.error(u'using title conflicts with using literal title')
# Prompt interactively when only the username was supplied.
3661 if opts.username is not None and opts.password is None:
3662 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalise string-valued options to numbers, rejecting bad input.
3663 if opts.ratelimit is not None:
3664 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3665 if numeric_limit is None:
3666 parser.error(u'invalid rate limit specified')
3667 opts.ratelimit = numeric_limit
3668 if opts.retries is not None:
3670 opts.retries = long(opts.retries)
3671 except (TypeError, ValueError), err:
3672 parser.error(u'invalid retry count specified')
3674 opts.playliststart = int(opts.playliststart)
3675 if opts.playliststart <= 0:
3676 raise ValueError(u'Playlist start must be positive')
3677 except (TypeError, ValueError), err:
3678 parser.error(u'invalid playlist start number specified')
3680 opts.playlistend = int(opts.playlistend)
3681 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3682 raise ValueError(u'Playlist end must be greater than playlist start')
3683 except (TypeError, ValueError), err:
3684 parser.error(u'invalid playlist end number specified')
3685 if opts.extractaudio:
3686 if opts.audioformat not in ['best', 'aac', 'mp3']:
3687 parser.error(u'invalid audio format specified')
# Build the FileDownloader; any --get-* flag implies quiet simulation.
3690 fd = FileDownloader({
3691 'usenetrc': opts.usenetrc,
3692 'username': opts.username,
3693 'password': opts.password,
3694 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3695 'forceurl': opts.geturl,
3696 'forcetitle': opts.gettitle,
3697 'forcethumbnail': opts.getthumbnail,
3698 'forcedescription': opts.getdescription,
3699 'forcefilename': opts.getfilename,
3700 'simulate': opts.simulate,
3701 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3702 'format': opts.format,
3703 'format_limit': opts.format_limit,
# Output template: first matching rule wins; the final literal is the
# fallback when no title/literal/autonumber option was selected.
3704 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3705 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3706 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3707 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3708 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3709 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3710 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3711 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3712 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3713 or u'%(id)s.%(ext)s'),
3714 'ignoreerrors': opts.ignoreerrors,
3715 'ratelimit': opts.ratelimit,
3716 'nooverwrites': opts.nooverwrites,
3717 'retries': opts.retries,
3718 'continuedl': opts.continue_dl,
3719 'noprogress': opts.noprogress,
3720 'playliststart': opts.playliststart,
3721 'playlistend': opts.playlistend,
# Writing the file to stdout ('-o -') forces log output to stderr.
3722 'logtostderr': opts.outtmpl == '-',
3723 'consoletitle': opts.consoletitle,
3724 'nopart': opts.nopart,
3725 'updatetime': opts.updatetime,
3726 'writedescription': opts.writedescription,
3727 'writeinfojson': opts.writeinfojson,
3728 'matchtitle': opts.matchtitle,
3729 'rejecttitle': opts.rejecttitle,
3731 for extractor in extractors:
3732 fd.add_info_extractor(extractor)
# Register post-processors requested on the command line.
3735 if opts.extractaudio:
3736 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update replaces this very script with the latest published version.
3739 if opts.update_self:
3740 updateSelf(fd, sys.argv[0])
3743 if len(all_urls) < 1:
3744 if not opts.update_self:
3745 parser.error(u'you must provide at least one URL')
3748 retcode = fd.download(all_urls)
3750 # Dump cookie jar if requested
3751 if opts.cookiefile is not None:
3754 except (IOError, OSError), err:
3755 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the main routine (the call itself is in a listing
# gap) and translate known exceptions into friendly exit messages.
3760 if __name__ == '__main__':
# Download errors are already reported by the downloader; exit silently.
3763 except DownloadError:
3765 except SameFileError:
3766 sys.exit(u'ERROR: fixed output name but more than one file to download')
3767 except KeyboardInterrupt:
3768 sys.exit(u'\nERROR: Interrupted by user')
3770 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: