2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.16'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Probe the locale-reported codec once; fall back to UTF-8 when the
    # lookup fails or names a codec Python does not actually know.
    # (The previous implementation wrapped this in a one-shot generator
    # consumed via .next(), which added no value.)
    try:
        pref = locale.getpreferredencoding()
        # Verify the codec really exists before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as a number, or None when the string
    cannot be parsed as an RFC 2822 date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Store the byte counts so callers can report the mismatch.

        downloaded: number of bytes actually received.
        expected:   number of bytes announced by the server.
        """
        self.downloaded = downloaded
        self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr."""
    # `message` is expected to be a unicode object; encode it with the
    # locale's preferred encoding before writing it out with a newline.
    sys.stderr.write('%s\n' % message.encode(preferredencoding()))
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if self.params.get('ignoreerrors', False):
        # Keep going, but remember that at least one download failed.
        self._download_retcode = 1
    else:
        raise DownloadError(message)
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
def undo_temp_name(self, filename):
    """Map a temporary '.part' filename back to its final name.

    Returns the filename unchanged when it does not carry the '.part'
    suffix (the visible original fell off the end and returned None in
    that case).
    """
    if filename.endswith(u'.part'):
        return filename[:-len(u'.part')]
    return filename
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Report destination filename."""
    msg = u'[download] Destination: %s' % filename
    self.to_screen(msg, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    # Feeds the %(autonumber)s output-template field.
    self._num_downloads = self._num_downloads + 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forceformat', False):
714 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
716 # Do nothing else if in simulate mode
717 if self.params.get('simulate', False):
723 matchtitle=self.params.get('matchtitle',False)
724 rejecttitle=self.params.get('rejecttitle',False)
725 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
729 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
733 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734 self.to_stderr(u'WARNING: file exists and will be skipped')
738 dn = os.path.dirname(filename)
739 if dn != '' and not os.path.exists(dn):
741 except (OSError, IOError), err:
742 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
745 if self.params.get('writedescription', False):
747 descfn = filename + '.description'
748 self.report_writedescription(descfn)
749 descfile = open(descfn, 'wb')
751 descfile.write(info_dict['description'].encode('utf-8'))
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write description file ' + descfn)
758 if self.params.get('writeinfojson', False):
759 infofn = filename + '.info.json'
760 self.report_writeinfojson(infofn)
763 except (NameError,AttributeError):
764 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
767 infof = open(infofn, 'wb')
769 json.dump(info_dict, infof)
772 except (OSError, IOError):
773 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
776 if not self.params.get('skip_download', False):
778 success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
779 info_dict.update(add_data)
780 except (OSError, IOError), err:
781 raise UnavailableVideoError
782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
783 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
785 except (ContentTooShortError, ), err:
786 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
791 self.post_process(filename, info_dict)
792 except (PostProcessingError), err:
793 self.trouble(u'ERROR: postprocessing: %s' % str(err))
796 def download(self, url_list):
797 """Download a given list of URLs."""
798 if len(url_list) > 1 and self.fixed_template():
799 raise SameFileError(self.params['outtmpl'])
802 suitable_found = False
804 # Go to next InfoExtractor if not suitable
805 if not ie.suitable(url):
808 # Suitable InfoExtractor found
809 suitable_found = True
811 # Extract information from URL and process it
814 # Suitable InfoExtractor had been found; go to next URL
817 if not suitable_found:
818 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
820 return self._download_retcode
822 def post_process(self, filename, ie_info):
823 """Run the postprocessing chain on the given file."""
825 info['filepath'] = filename
831 def _download_with_rtmpdump(self, filename, url, player_url):
832 self.report_destination(filename)
833 tmpfilename = self.temp_name(filename)
835 # Check for rtmpdump first
837 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
838 except (OSError, IOError):
839 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
842 # Download using rtmpdump. rtmpdump returns exit code 2 when
843 # the connection was interrumpted and resuming appears to be
844 # possible. This is part of rtmpdump's normal usage, AFAIK.
845 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
846 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
847 while retval == 2 or retval == 1:
848 prevsize = os.path.getsize(tmpfilename)
849 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
850 time.sleep(5.0) # This seems to be needed
851 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
852 cursize = os.path.getsize(tmpfilename)
853 if prevsize == cursize and retval == 1:
855 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
856 if prevsize == cursize and retval == 2 and cursize > 1024:
857 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
861 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
862 self.try_rename(tmpfilename, filename)
865 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
868 def _do_download(self, filename, url, player_url):
869 # Check file already present
870 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
871 self.report_file_already_downloaded(filename)
874 # Attempt to download using rtmpdump
875 if url.startswith('rtmp'):
876 return self._download_with_rtmpdump(filename, url, player_url)
878 tmpfilename = self.temp_name(filename)
882 # Do not include the Accept-Encoding header
883 headers = {'Youtubedl-no-compression': 'True'}
884 basic_request = urllib2.Request(url, None, headers)
885 request = urllib2.Request(url, None, headers)
887 # Establish possible resume length
888 if os.path.isfile(tmpfilename):
889 resume_len = os.path.getsize(tmpfilename)
893 # Request parameters in case of being able to resume
894 if self.params.get('continuedl', False) and resume_len != 0:
895 self.report_resuming_byte(resume_len)
896 request.add_header('Range', 'bytes=%d-' % resume_len)
900 retries = self.params.get('retries', 0)
901 while count <= retries:
902 # Establish connection
904 data = urllib2.urlopen(request)
906 except (urllib2.HTTPError, ), err:
907 if (err.code < 500 or err.code >= 600) and err.code != 416:
908 # Unexpected HTTP error
910 elif err.code == 416:
911 # Unable to resume (requested range not satisfiable)
913 # Open the connection again without the range header
914 data = urllib2.urlopen(basic_request)
915 content_length = data.info()['Content-Length']
916 except (urllib2.HTTPError, ), err:
917 if err.code < 500 or err.code >= 600:
920 # Examine the reported length
921 if (content_length is not None and
922 (resume_len - 100 < long(content_length) < resume_len + 100)):
923 # The file had already been fully downloaded.
924 # Explanation to the above condition: in issue #175 it was revealed that
925 # YouTube sometimes adds or removes a few bytes from the end of the file,
926 # changing the file size slightly and causing problems for some users. So
927 # I decided to implement a suggested change and consider the file
928 # completely downloaded if the file size differs less than 100 bytes from
929 # the one in the hard drive.
930 self.report_file_already_downloaded(filename)
931 self.try_rename(tmpfilename, filename)
934 # The length does not match, we start the download over
935 self.report_unable_to_resume()
941 self.report_retry(count, retries)
944 self.trouble(u'ERROR: giving up after %s retries' % retries)
947 data_len = data.info().get('Content-length', None)
948 if data_len is not None:
949 data_len = long(data_len) + resume_len
950 data_len_str = self.format_bytes(data_len)
951 byte_counter = 0 + resume_len
957 data_block = data.read(block_size)
959 if len(data_block) == 0:
961 byte_counter += len(data_block)
963 # Open file just in time
966 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
967 assert stream is not None
968 filename = self.undo_temp_name(tmpfilename)
969 self.report_destination(filename)
970 except (OSError, IOError), err:
971 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
974 stream.write(data_block)
975 except (IOError, OSError), err:
976 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
978 block_size = self.best_block_size(after - before, len(data_block))
981 percent_str = self.calc_percent(byte_counter, data_len)
982 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
983 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
984 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
987 self.slow_down(start, byte_counter - resume_len)
990 self.trouble(u'\nERROR: Did not get any data blocks')
994 if data_len is not None and byte_counter != data_len:
995 raise ContentTooShortError(byte_counter, long(data_len))
996 self.try_rename(tmpfilename, filename)
998 # Update file modification time
1000 if self.params.get('updatetime', True):
1001 filetime = self.try_utime(filename, data.info().get('last-modified', None))
1003 return True, {'filetime': filetime}
1006 class InfoExtractor(object):
1007 """Information Extractor class.
1009 Information extractors are the classes that, given a URL, extract
1010 information from the video (or videos) the URL refers to. This
1011 information includes the real video URL, the video title and simplified
1012 title, author and others. The information is stored in a dictionary
1013 which is then passed to the FileDownloader. The FileDownloader
1014 processes this information possibly downloading the video to the file
1015 system, among other possible outcomes. The dictionaries must include
1016 the following fields:
1018 id: Video identifier.
1019 url: Final video URL.
1020 uploader: Nickname of the video uploader.
1021 title: Literal title.
1022 stitle: Simplified title.
1023 ext: Video filename extension.
1024 format: Video format.
1025 player_url: SWF Player URL (may be None).
1027 The following fields are optional. Their primary purpose is to allow
1028 youtube-dl to serve as the backend for a video search function, such
1029 as the one in youtube2mp3. They are only used when their respective
1030 forced printing functions are called:
1032 thumbnail: Full URL to a video thumbnail image.
1033 description: One-line video description.
1035 Subclasses of this one should re-define the _real_initialize() and
1036 _real_extract() methods and define a _VALID_URL regexp.
1037 Probably, they should also be added to the list of extractors.
1043 def __init__(self, downloader=None):
1044 """Constructor. Receives an optional downloader."""
1046 self.set_downloader(downloader)
1048 def suitable(self, url):
1049 """Receives a URL and returns True if suitable for this IE."""
1050 return re.match(self._VALID_URL, url) is not None
1052 def initialize(self):
1053 """Initializes an instance (authentication, etc)."""
1055 self._real_initialize()
1058 def extract(self, url):
1059 """Extracts URL information and returns it in list of dicts."""
1061 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # May be None (the constructor default); reporting helpers that call
    # self._downloader.to_screen assume one has been set before extraction.
    self._downloader = downloader
1067 def _real_initialize(self):
1068 """Real initialization process. Redefine in subclasses."""
1071 def _real_extract(self, url):
1072 """Real extraction process. Redefine in subclasses."""
1076 class YoutubeIE(InfoExtractor):
1077 """Information extractor for youtube.com."""
# NOTE(review): this chunk is an elided snapshot (embedded original line
# numbers, interior lines missing). Comments below describe only visible code.
1079 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1080 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1081 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1082 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1083 _NETRC_MACHINE = 'youtube'
1084 # Listed in order of quality
1085 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1086 _video_extensions = {
1092 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1096 IE_NAME = u'youtube'
1098 def report_lang(self):
1099 """Report attempt to set language."""
1100 self._downloader.to_screen(u'[youtube] Setting language')
1102 def report_login(self):
1103 """Report attempt to log in."""
1104 self._downloader.to_screen(u'[youtube] Logging in')
1106 def report_age_confirmation(self):
1107 """Report attempt to confirm age."""
1108 self._downloader.to_screen(u'[youtube] Confirming age')
1110 def report_video_webpage_download(self, video_id):
1111 """Report attempt to download video webpage."""
1112 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1114 def report_video_info_webpage_download(self, video_id):
1115 """Report attempt to download video info webpage."""
1116 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1118 def report_information_extraction(self, video_id):
1119 """Report attempt to extract video information."""
1120 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1122 def report_unavailable_format(self, video_id, format):
1123 """Report extracted video URL."""
1124 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1126 def report_rtmp_download(self):
1127 """Indicate the download will use the RTMP protocol."""
1128 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Initialization: set English language cookie, authenticate (explicit
# username/password params take precedence over .netrc), then pass the
# age-verification gate. Auth failures are warnings, not fatal errors.
1130 def _real_initialize(self):
1131 if self._downloader is None:
1136 downloader_params = self._downloader.params
1138 # Attempt to use provided username and password or .netrc data
1139 if downloader_params.get('username', None) is not None:
1140 username = downloader_params['username']
1141 password = downloader_params['password']
1142 elif downloader_params.get('usenetrc', False):
1144 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1145 if info is not None:
1149 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1150 except (IOError, netrc.NetrcParseError), err:
1151 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1155 request = urllib2.Request(self._LANG_URL)
1158 urllib2.urlopen(request).read()
1159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1163 # No authentication to be performed
1164 if username is None:
1169 'current_form': 'loginForm',
1171 'action_login': 'Log In',
1172 'username': username,
1173 'password': password,
1175 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1178 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
1179 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1180 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1182 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1183 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1189 'action_confirm': 'Confirm',
1191 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1193 self.report_age_confirmation()
1194 age_results = urllib2.urlopen(request).read()
1195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1196 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1199 def _real_extract(self, url):
1200 # Extract video id from URL
1201 mobj = re.match(self._VALID_URL, url)
1203 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1205 video_id = mobj.group(2)
# Phase 1: fetch the watch page (has_verified=1 bypasses some interstitials).
1208 self.report_video_webpage_download(video_id)
1209 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1211 video_webpage = urllib2.urlopen(request).read()
1212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1216 # Attempt to extract SWF player URL
1217 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1218 if mobj is not None:
# Un-escape the JSON-style backslash escapes in the SWF URL.
1219 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Phase 2: query get_video_info, trying several 'el' values until a
# response containing a 'token' parameter is found.
1224 self.report_video_info_webpage_download(video_id)
1225 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1226 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1227 % (video_id, el_type))
1228 request = urllib2.Request(video_info_url)
1230 video_info_webpage = urllib2.urlopen(request).read()
1231 video_info = parse_qs(video_info_webpage)
1232 if 'token' in video_info:
1234 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1235 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1237 if 'token' not in video_info:
1238 if 'reason' in video_info:
1239 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1241 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1244 # Start extracting information
1245 self.report_information_extraction(video_id)
1248 if 'author' not in video_info:
1249 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1251 video_uploader = urllib.unquote_plus(video_info['author'][0])
1254 if 'title' not in video_info:
1255 self._downloader.trouble(u'ERROR: unable to extract video title')
1257 video_title = urllib.unquote_plus(video_info['title'][0])
1258 video_title = video_title.decode('utf-8')
1259 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of non-alphanumerics to '_'.
1262 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1263 simple_title = simple_title.strip(ur'_')
1266 if 'thumbnail_url' not in video_info:
1267 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1268 video_thumbnail = ''
1269 else: # don't panic if we can't find it
1270 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD,
# trying several locale date formats.
1274 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1275 if mobj is not None:
1276 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1277 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1278 for expression in format_expressions:
1280 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1288 video_description = u'No description available.'
1289 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1290 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1291 if mobj is not None:
1292 video_description = mobj.group(1).decode('utf-8')
1294 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1295 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1296 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1297 # TODO use another parser
1300 video_token = urllib.unquote_plus(video_info['token'][0])
1302 # Decide which formats to download
1303 req_format = self._downloader.params.get('format', None)
1305 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1306 self.report_rtmp_download()
1307 video_url_list = [(None, video_info['conn'][0])]
1308 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1309 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1310 url_data = [parse_qs(uds) for uds in url_data_strs]
1311 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1312 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1314 format_limit = self._downloader.params.get('format_limit', None)
1315 if format_limit is not None and format_limit in self._available_formats:
1316 format_list = self._available_formats[self._available_formats.index(format_limit):]
1318 format_list = self._available_formats
1319 existing_formats = [x for x in format_list if x in url_map]
1320 if len(existing_formats) == 0:
1321 self._downloader.trouble(u'ERROR: no known formats available for video')
1323 if req_format is None or req_format == 'best':
1324 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1325 elif req_format == 'worst':
# NOTE(review): existing_formats[-1] would be the idiomatic spelling here.
1326 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1327 elif req_format in ('-1', 'all'):
1328 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1330 # Specific formats. We pick the first in a slash-delimeted sequence.
1331 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1332 req_formats = req_format.split('/')
1333 video_url_list = None
1334 for rf in req_formats:
1336 video_url_list = [(rf, url_map[rf])]
1338 if video_url_list is None:
1339 self._downloader.trouble(u'ERROR: requested format not available')
1342 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1345 for format_param, video_real_url in video_url_list:
1346 # At this point we have a new video
1347 self._downloader.increment_downloads()
1350 video_extension = self._video_extensions.get(format_param, 'flv')
1353 # Process video information
1354 self._downloader.process_info({
1355 'id': video_id.decode('utf-8'),
1356 'url': video_real_url.decode('utf-8'),
1357 'uploader': video_uploader.decode('utf-8'),
1358 'upload_date': upload_date,
1359 'title': video_title,
1360 'stitle': simple_title,
1361 'ext': video_extension.decode('utf-8'),
# NOTE(review): pre-Python-2.5 `and/or` conditional idiom; safe here only
# because u'NA' is truthy. A plain if/else expression would be clearer.
1362 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1363 'thumbnail': video_thumbnail.decode('utf-8'),
1364 'description': video_description,
1365 'player_url': player_url,
1367 except UnavailableVideoError, err:
1368 self._downloader.trouble(u'\nERROR: unable to download video')
1371 class MetacafeIE(InfoExtractor):
1372 """Information Extractor for metacafe.com."""
1374 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1375 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1376 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1378 IE_NAME = u'metacafe'
# Keeps a reference to a YoutubeIE so that Metacafe 'yt-<id>' videos can be
# delegated to the YouTube extractor.
1380 def __init__(self, youtube_ie, downloader=None):
1381 InfoExtractor.__init__(self, downloader)
1382 self._youtube_ie = youtube_ie
1384 def report_disclaimer(self):
1385 """Report disclaimer retrieval."""
1386 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1388 def report_age_confirmation(self):
1389 """Report attempt to confirm age."""
1390 self._downloader.to_screen(u'[metacafe] Confirming age')
1392 def report_download_webpage(self, video_id):
1393 """Report webpage download."""
1394 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1396 def report_extraction(self, video_id):
1397 """Report information extraction."""
1398 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Initialization: fetch the disclaimer page, then POST the family-filter
# form to get past the age gate before any extraction.
1400 def _real_initialize(self):
1401 # Retrieve disclaimer
1402 request = urllib2.Request(self._DISCLAIMER)
1404 self.report_disclaimer()
1405 disclaimer = urllib2.urlopen(request).read()
1406 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1407 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1413 'submit': "Continue - I'm over 18",
1415 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1417 self.report_age_confirmation()
1418 disclaimer = urllib2.urlopen(request).read()
1419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1420 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1423 def _real_extract(self, url):
1424 # Extract id and simplified title from URL
1425 mobj = re.match(self._VALID_URL, url)
1427 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1430 video_id = mobj.group(1)
1432 # Check if video comes from YouTube
1433 mobj2 = re.match(r'^yt-(.*)$', video_id)
1434 if mobj2 is not None:
# Delegate 'yt-<id>' videos to the YouTube extractor.
1435 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1438 # At this point we have a new video
1439 self._downloader.increment_downloads()
1441 simple_title = mobj.group(2).decode('utf-8')
1443 # Retrieve video webpage to extract further information
1444 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1446 self.report_download_webpage(video_id)
1447 webpage = urllib2.urlopen(request).read()
1448 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1452 # Extract URL, uploader and title from webpage
1453 self.report_extraction(video_id)
# Primary path: direct mediaURL parameter; fallback path below parses the
# flashvars 'mediaData' JSON-ish blob instead.
1454 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1455 if mobj is not None:
1456 mediaURL = urllib.unquote(mobj.group(1))
1457 video_extension = mediaURL[-3:]
1459 # Extract gdaKey if available
1460 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1462 video_url = mediaURL
1464 gdaKey = mobj.group(1)
1465 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1467 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1469 self._downloader.trouble(u'ERROR: unable to extract media URL')
1471 vardict = parse_qs(mobj.group(1))
1472 if 'mediaData' not in vardict:
1473 self._downloader.trouble(u'ERROR: unable to extract media URL')
1475 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1477 self._downloader.trouble(u'ERROR: unable to extract media URL')
1479 mediaURL = mobj.group(1).replace('\\/', '/')
1480 video_extension = mediaURL[-3:]
1481 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1483 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1485 self._downloader.trouble(u'ERROR: unable to extract title')
1487 video_title = mobj.group(1).decode('utf-8')
1488 video_title = sanitize_title(video_title)
1490 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1492 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1494 video_uploader = mobj.group(1)
1497 # Process video information
1498 self._downloader.process_info({
1499 'id': video_id.decode('utf-8'),
1500 'url': video_url.decode('utf-8'),
1501 'uploader': video_uploader.decode('utf-8'),
1502 'upload_date': u'NA',
1503 'title': video_title,
1504 'stitle': simple_title,
1505 'ext': video_extension.decode('utf-8'),
1509 except UnavailableVideoError:
1510 self._downloader.trouble(u'\nERROR: unable to download video')
1513 class DailymotionIE(InfoExtractor):
1514 """Information Extractor for Dailymotion"""
1516 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1517 IE_NAME = u'dailymotion'
1519 def __init__(self, downloader=None):
1520 InfoExtractor.__init__(self, downloader)
1522 def report_download_webpage(self, video_id):
1523 """Report webpage download."""
1524 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1526 def report_extraction(self, video_id):
1527 """Report information extraction."""
1528 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1530 def _real_initialize(self):
1533 def _real_extract(self, url):
1534 # Extract id and simplified title from URL
1535 mobj = re.match(self._VALID_URL, url)
1537 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1540 # At this point we have a new video
1541 self._downloader.increment_downloads()
1542 video_id = mobj.group(1)
1544 simple_title = mobj.group(2).decode('utf-8')
1545 video_extension = 'flv'
1547 # Retrieve video webpage to extract further information
1548 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served.
1549 request.add_header('Cookie', 'family_filter=off')
1551 self.report_download_webpage(video_id)
1552 webpage = urllib2.urlopen(request).read()
1553 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1554 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1557 # Extract URL, uploader and title from webpage
1558 self.report_extraction(video_id)
# The player's 'sequence' flashvar holds a URL-encoded JSON blob; the
# standard-definition stream URL lives in its 'sdURL' field.
1559 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1561 self._downloader.trouble(u'ERROR: unable to extract media URL')
1563 sequence = urllib.unquote(mobj.group(1))
1564 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1566 self._downloader.trouble(u'ERROR: unable to extract media URL')
1568 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1570 # if needed add http://www.dailymotion.com/ if relative URL
1572 video_url = mediaURL
1574 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1576 self._downloader.trouble(u'ERROR: unable to extract title')
1578 video_title = mobj.group(1).decode('utf-8')
1579 video_title = sanitize_title(video_title)
1581 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1583 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1585 video_uploader = mobj.group(1)
1588 # Process video information
1589 self._downloader.process_info({
1590 'id': video_id.decode('utf-8'),
1591 'url': video_url.decode('utf-8'),
1592 'uploader': video_uploader.decode('utf-8'),
1593 'upload_date': u'NA',
1594 'title': video_title,
1595 'stitle': simple_title,
1596 'ext': video_extension.decode('utf-8'),
1600 except UnavailableVideoError:
1601 self._downloader.trouble(u'\nERROR: unable to download video')
1604 class GoogleIE(InfoExtractor):
1605 """Information extractor for video.google.com."""
1607 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1608 IE_NAME = u'video.google'
1610 def __init__(self, downloader=None):
1611 InfoExtractor.__init__(self, downloader)
1613 def report_download_webpage(self, video_id):
1614 """Report webpage download."""
1615 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1617 def report_extraction(self, video_id):
1618 """Report information extraction."""
1619 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1621 def _real_initialize(self):
1624 def _real_extract(self, url):
1625 # Extract id from URL
1626 mobj = re.match(self._VALID_URL, url)
1628 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1631 # At this point we have a new video
1632 self._downloader.increment_downloads()
1633 video_id = mobj.group(1)
1635 video_extension = 'mp4'
1637 # Retrieve video webpage to extract further information
1638 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1640 self.report_download_webpage(video_id)
1641 webpage = urllib2.urlopen(request).read()
1642 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1643 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1646 # Extract URL, uploader, and title from webpage
1647 self.report_extraction(video_id)
# Primary path: direct download_url; otherwise fall back to the flash
# player's hex-escaped videoUrl (and switch the extension to flv).
1648 mobj = re.search(r"download_url:'([^']+)'", webpage)
1650 video_extension = 'flv'
1651 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract media URL')
1655 mediaURL = urllib.unquote(mobj.group(1))
# Translate the literal '\x3d'/'\x26' escape sequences into '=' and '&'.
1656 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1657 mediaURL = mediaURL.replace('\\x26', '\x26')
1659 video_url = mediaURL
1661 mobj = re.search(r'<title>(.*)</title>', webpage)
1663 self._downloader.trouble(u'ERROR: unable to extract title')
1665 video_title = mobj.group(1).decode('utf-8')
1666 video_title = sanitize_title(video_title)
1667 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1669 # Extract video description
1670 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1672 self._downloader.trouble(u'ERROR: unable to extract video description')
1674 video_description = mobj.group(1).decode('utf-8')
1675 if not video_description:
1676 video_description = 'No description available.'
1678 # Extract video thumbnail
# Thumbnail requires a second request (a search page), so it is only
# fetched when explicitly requested via forcethumbnail.
1679 if self._downloader.params.get('forcethumbnail', False):
1680 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1682 webpage = urllib2.urlopen(request).read()
1683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1684 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1686 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1688 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1690 video_thumbnail = mobj.group(1)
1691 else: # we need something to pass to process_info
1692 video_thumbnail = ''
1695 # Process video information
1696 self._downloader.process_info({
1697 'id': video_id.decode('utf-8'),
1698 'url': video_url.decode('utf-8'),
1700 'upload_date': u'NA',
1701 'title': video_title,
1702 'stitle': simple_title,
1703 'ext': video_extension.decode('utf-8'),
1707 except UnavailableVideoError:
1708 self._downloader.trouble(u'\nERROR: unable to download video')
1711 class PhotobucketIE(InfoExtractor):
1712 """Information extractor for photobucket.com."""
1714 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1715 IE_NAME = u'photobucket'
1717 def __init__(self, downloader=None):
1718 InfoExtractor.__init__(self, downloader)
1720 def report_download_webpage(self, video_id):
1721 """Report webpage download."""
1722 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1724 def report_extraction(self, video_id):
1725 """Report information extraction."""
1726 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1728 def _real_initialize(self):
1731 def _real_extract(self, url):
1732 # Extract id from URL
1733 mobj = re.match(self._VALID_URL, url)
1735 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1738 # At this point we have a new video
1739 self._downloader.increment_downloads()
1740 video_id = mobj.group(1)
# Only .flv URLs match _VALID_URL, so the extension is fixed.
1742 video_extension = 'flv'
1744 # Retrieve video webpage to extract further information
1745 request = urllib2.Request(url)
1747 self.report_download_webpage(video_id)
1748 webpage = urllib2.urlopen(request).read()
1749 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1750 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1753 # Extract URL, uploader, and title from webpage
1754 self.report_extraction(video_id)
# The media URL is carried in the page's video_src <link> element.
1755 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1757 self._downloader.trouble(u'ERROR: unable to extract media URL')
1759 mediaURL = urllib.unquote(mobj.group(1))
1761 video_url = mediaURL
# A single <title> regex yields both the title (group 1) and uploader (group 2).
1763 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1765 self._downloader.trouble(u'ERROR: unable to extract title')
1767 video_title = mobj.group(1).decode('utf-8')
1768 video_title = sanitize_title(video_title)
1769 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1771 video_uploader = mobj.group(2).decode('utf-8')
1774 # Process video information
1775 self._downloader.process_info({
1776 'id': video_id.decode('utf-8'),
1777 'url': video_url.decode('utf-8'),
1778 'uploader': video_uploader,
1779 'upload_date': u'NA',
1780 'title': video_title,
1781 'stitle': simple_title,
1782 'ext': video_extension.decode('utf-8'),
1786 except UnavailableVideoError:
1787 self._downloader.trouble(u'\nERROR: unable to download video')
1790 class YahooIE(InfoExtractor):
1791 """Information extractor for video.yahoo.com."""
1793 # _VALID_URL matches all Yahoo! Video URLs
1794 # _VPAGE_URL matches only the extractable '/watch/' URLs
1795 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1796 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1797 IE_NAME = u'video.yahoo'
1799 def __init__(self, downloader=None):
1800 InfoExtractor.__init__(self, downloader)
1802 def report_download_webpage(self, video_id):
1803 """Report webpage download."""
1804 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1806 def report_extraction(self, video_id):
1807 """Report information extraction."""
1808 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1810 def _real_initialize(self):
# Two-pass extraction: non-/watch/ URLs are first rewritten to the
# canonical /watch/<vid>/<id> form, then re-extracted (new_video=False
# guards against double-counting on the recursive call).
1813 def _real_extract(self, url, new_video=True):
1814 # Extract ID from URL
1815 mobj = re.match(self._VALID_URL, url)
1817 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1820 # At this point we have a new video
1821 self._downloader.increment_downloads()
1822 video_id = mobj.group(2)
1823 video_extension = 'flv'
1825 # Rewrite valid but non-extractable URLs as
1826 # extractable English language /watch/ URLs
1827 if re.match(self._VPAGE_URL, url) is None:
1828 request = urllib2.Request(url)
1830 webpage = urllib2.urlopen(request).read()
1831 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1832 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1835 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1837 self._downloader.trouble(u'ERROR: Unable to extract id field')
1839 yahoo_id = mobj.group(1)
1841 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1843 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1845 yahoo_vid = mobj.group(1)
1847 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1848 return self._real_extract(url, new_video=False)
1850 # Retrieve video webpage to extract further information
1851 request = urllib2.Request(url)
1853 self.report_download_webpage(video_id)
1854 webpage = urllib2.urlopen(request).read()
1855 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1859 # Extract uploader and title from webpage
1860 self.report_extraction(video_id)
1861 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1863 self._downloader.trouble(u'ERROR: unable to extract video title')
1865 video_title = mobj.group(1).decode('utf-8')
1866 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1868 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1870 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) is the 'people|profile' alternation, not the link
# text — group(2) looks like the intended uploader name; verify.
1872 video_uploader = mobj.group(1).decode('utf-8')
1874 # Extract video thumbnail
1875 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1877 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1879 video_thumbnail = mobj.group(1).decode('utf-8')
1881 # Extract video description
1882 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1884 self._downloader.trouble(u'ERROR: unable to extract video description')
1886 video_description = mobj.group(1).decode('utf-8')
1887 if not video_description:
1888 video_description = 'No description available.'
1890 # Extract video height and width
1891 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1893 self._downloader.trouble(u'ERROR: unable to extract video height')
1895 yv_video_height = mobj.group(1)
1897 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1899 self._downloader.trouble(u'ERROR: unable to extract video width')
1901 yv_video_width = mobj.group(1)
1903 # Retrieve video playlist to extract media URL
1904 # I'm not completely sure what all these options are, but we
1905 # seem to need most of them, otherwise the server sends a 401.
1906 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1907 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1908 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1909 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1910 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1912 self.report_download_webpage(video_id)
1913 webpage = urllib2.urlopen(request).read()
1914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1918 # Extract media URL from playlist XML
1919 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1921 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1923 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1924 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1927 # Process video information
1928 self._downloader.process_info({
1929 'id': video_id.decode('utf-8'),
1931 'uploader': video_uploader,
1932 'upload_date': u'NA',
1933 'title': video_title,
1934 'stitle': simple_title,
1935 'ext': video_extension.decode('utf-8'),
1936 'thumbnail': video_thumbnail.decode('utf-8'),
1937 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key — this entry silently overwrites
# the decoded value two lines above; one of the two should be removed.
1938 'thumbnail': video_thumbnail,
1941 except UnavailableVideoError:
1942 self._downloader.trouble(u'\nERROR: unable to download video')
1945 class VimeoIE(InfoExtractor):
1946 """Information extractor for vimeo.com."""
1948 # _VALID_URL matches Vimeo URLs
1949 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1952 def __init__(self, downloader=None):
1953 InfoExtractor.__init__(self, downloader)
1955 def report_download_webpage(self, video_id):
1956 """Report webpage download."""
1957 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1959 def report_extraction(self, video_id):
1960 """Report information extraction."""
1961 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1963 def _real_initialize(self):
1966 def _real_extract(self, url, new_video=True):
1967 # Extract ID from URL
1968 mobj = re.match(self._VALID_URL, url)
1970 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1973 # At this point we have a new video
1974 self._downloader.increment_downloads()
1975 video_id = mobj.group(1)
1977 # Retrieve video webpage to extract further information
# All metadata comes from the moogaloop player's XML config endpoint.
1978 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1980 self.report_download_webpage(video_id)
1981 webpage = urllib2.urlopen(request).read()
1982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1983 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1986 # Now we begin extracting as much information as we can from what we
1987 # retrieved. First we extract the information common to all extractors,
1988 # and latter we extract those that are Vimeo specific.
1989 self.report_extraction(video_id)
1992 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1994 self._downloader.trouble(u'ERROR: unable to extract video title')
1996 video_title = mobj.group(1).decode('utf-8')
1997 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2000 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2002 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2004 video_uploader = mobj.group(1).decode('utf-8')
2006 # Extract video thumbnail
2007 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2009 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2011 video_thumbnail = mobj.group(1).decode('utf-8')
2013 # # Extract video description
2014 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2016 # self._downloader.trouble(u'ERROR: unable to extract video description')
2018 # video_description = mobj.group(1).decode('utf-8')
2019 # if not video_description: video_description = 'No description available.'
# NOTE(review): hard-coded placeholder — the real description extraction is
# commented out above, so every Vimeo video gets the description 'Foo.'.
2020 video_description = 'Foo.'
2022 # Vimeo specific: extract request signature
2023 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2025 self._downloader.trouble(u'ERROR: unable to extract request signature')
2027 sig = mobj.group(1).decode('utf-8')
2029 # Vimeo specific: Extract request signature expiration
2030 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2032 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2034 sig_exp = mobj.group(1).decode('utf-8')
# The playable URL is built from the clip id plus the signature/expiry pair.
2036 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2039 # Process video information
2040 self._downloader.process_info({
2041 'id': video_id.decode('utf-8'),
2043 'uploader': video_uploader,
2044 'upload_date': u'NA',
2045 'title': video_title,
2046 'stitle': simple_title,
2048 'thumbnail': video_thumbnail.decode('utf-8'),
2049 'description': video_description,
# NOTE(review): duplicate 'thumbnail' and 'description' keys — these two
# entries silently overwrite the ones just above; remove one of each pair.
2050 'thumbnail': video_thumbnail,
2051 'description': video_description,
2054 except UnavailableVideoError:
2055 self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor claims a URL: downloads the page,
    looks for an embedded media URL (JW Player / SWFObject patterns) and
    derives id/title/uploader heuristically.

    NOTE(review): this excerpt elides several original lines (``if mobj is
    None:`` guards, ``try:`` openers, ``return`` statements); ``[elided]``
    comments below mark the gaps.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: generic extraction is best-effort only.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # [elided in excerpt: body — presumably a bare return]

    def _real_extract(self, url):
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Last path component doubles as a provisional video id.
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        # [elided in excerpt: try:]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # [elided in excerpt: if mobj is None:]
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # [elided in excerpt: if mobj is None:]
        # NOTE(review): message says "title" but this branch extracts the
        # uploader — looks like a copy-paste error; confirm before fixing.
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_uploader = mobj.group(1).decode('utf-8')

        # [elided in excerpt: try:]
        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        # [elided in excerpt: remaining keys and closing '})']
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch[N|all]:<query>`` pseudo-URLs: scrapes result pages
    and delegates each found video to the wrapped YoutubeIE.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return``/``break`` statements, loop headers);
    ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap on how many results one query may download.
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to this wrapped extractor.
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        # Parse the ytsearch prefix to decide how many results to fetch.
        mobj = re.match(self._VALID_URL, query)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # [elided in excerpt: if prefix == '':  (no count -> single result)]
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # [elided in excerpt: else: try: n = long(prefix); if n <= 0:]
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp oversized requests instead of failing.
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # [elided in excerpt: video_ids = [] and pagenum initialization]
        already_seen = set()

        # [elided in excerpt: while True:  (page loop)]
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        # [elided in excerpt: try:]
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Pull the id out of href="/watch?v=ID" (drops trailing quote).
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                    # [elided in excerpt: return]

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No "Next" link: download whatever was collected and stop.
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
            # [elided in excerpt: return]

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearch[N|all]:<query>`` pseudo-URLs; structurally a clone
    of YoutubeSearchIE with Google Video URLs and regexes.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return``/``break`` statements, loop headers);
    ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard cap on how many results one query may download.
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to this wrapped extractor.
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        # Parse the gvsearch prefix to decide how many results to fetch.
        mobj = re.match(self._VALID_URL, query)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # [elided in excerpt: if prefix == '':  (no count -> single result)]
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # [elided in excerpt: else: try: n = long(prefix); if n <= 0:]
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp oversized requests instead of failing.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # [elided in excerpt: video_ids = [] and pagenum initialization]
        already_seen = set()

        # [elided in excerpt: while True:  (page loop)]
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        # [elided in excerpt: try:]
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Here the regex captures the docid directly.
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                    # [elided in excerpt: return]

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No "Next" link: download whatever was collected and stop.
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
            # [elided in excerpt: return]

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearch[N|all]:<query>`` pseudo-URLs; structurally a clone
    of YoutubeSearchIE with Yahoo! Video URLs and regexes.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return``/``break`` statements, loop headers);
    ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on how many results one query may download.
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to this wrapped extractor.
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        # Parse the yvsearch prefix to decide how many results to fetch.
        mobj = re.match(self._VALID_URL, query)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # [elided in excerpt: if prefix == '':  (no count -> single result)]
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # [elided in excerpt: else: try: n = long(prefix); if n <= 0:]
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp oversized requests instead of failing.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # [elided in excerpt: video_ids = [] and pagenum initialization]
        already_seen = set()

        # [elided in excerpt: while True:  (page loop)]
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        # [elided in excerpt: try:]
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Yahoo ids are "<num>/<num>" path fragments.
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
                    # [elided in excerpt: return]

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No "Next" link: download whatever was collected and stop.
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
            # [elided in excerpt: return]

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Accepts view_play_list/playlist/artist/user URLs, pages through the
    playlist and feeds each video id to the wrapped YoutubeIE.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return``/``break`` statements, loop headers);
    ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to this wrapped extractor.
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Group 3 captures an explicit video id inside the URL: single video.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))
            # [elided in excerpt: return]

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # [elided in excerpt: else:]
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        # [elided in excerpt: video_ids = []; pagenum init; while True:]
        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
        # [elided in excerpt: try:]
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # [elided in excerpt: ids_in_page = []]
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Preserve page order while de-duplicating within the page.
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # [elided in excerpt: break — no "Next" link, last page reached]
        pagenum = pagenum + 1

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): when playlistend keeps its -1 default this slice
        # drops the LAST video; YoutubeUserIE below special-cases -1.
        # Looks like an off-by-one — confirm against upstream behavior.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        # [elided in excerpt: return]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Lists a user's uploads via the GData API (paged, 50 ids per request)
    and feeds each id to the wrapped YoutubeIE.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, loop headers, ``break``/``return``); ``[elided]``
    comments below mark the gaps.
    """

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request at 50.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to this wrapped extractor.
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # [elided in excerpt: rest of comment; video_ids/pagenum init; while True:]
        # GData start-index is 1-based.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        # [elided in excerpt: try:]
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # [elided in excerpt: ids_in_page = []]
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # [elided in excerpt: rest of comment]
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # [elided in excerpt: break; pagenum increment]

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # -1 sentinel means "to the end"; a plain [start:-1] slice would
        # silently drop the last video, hence the special case.
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # [elided in excerpt: else:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button and scrapes the real
    file URL (or the restriction message) from the response page.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return`` statements, ``else:`` branches);
    ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        # [elided in excerpt: body — presumably a bare return]

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        # [elided in excerpt: try:]
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            # [elided in excerpt: else:]
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: unable to extract title')

        file_title = mobj.group(1).decode('utf-8')

        # [elided in excerpt: try:]
        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        # [elided in excerpt: remaining keys and closing '})']
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (explicit credentials or .netrc), downloads the
    video page and scrapes metadata plus per-format media URLs out of
    the page's Javascript.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return`` statements, ``else:`` branches, dict
    initializers/closers); ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used for .netrc credential lookup.
    _NETRC_MACHINE = 'facebook'
    # Ordered best-quality-first; drives format selection below.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
    # [elided in excerpt: format -> extension entries and closing '}']
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Field name -> scraping regex for the general metadata.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            # [elided in excerpt: closing '}' and video_info = {}]
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are %-escaped, unicode-escaped Javascript strings.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # [elided in excerpt: video_urls = {}]
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        # [elided in excerpt: return video_info]

    def _real_initialize(self):
        # Best-effort login; all failures downgrade to warnings so
        # public videos can still be fetched anonymously.
        if self._downloader is None:
            # [elided in excerpt: return]

        # [elided in excerpt: useremail/password defaults]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided in excerpt: try:]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # [elided in excerpt: useremail/password unpacking; else:]
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # [elided in excerpt: return]

        if useremail is None:
            # [elided in excerpt: return; login_form construction]
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # [elided in excerpt: try: / self.report_login()]
        login_results = urllib2.urlopen(request).read()
        # Facebook serves the login form again when credentials fail.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # [elided in excerpt: return]
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # [elided in excerpt: return]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # [elided in excerpt: try:]
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [elided in excerpt: return]
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # [elided in excerpt: return]
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse anything outside the safe charset to '_'
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail: missing one is only a warning, not fatal
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        # [elided in excerpt: else:]
            video_thumbnail = video_info['thumbnail']

        # upload date: parse an RFC-2822-style date into YYYYMMDD
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # [elided in excerpt: try:]
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                # [elided in excerpt: except: pass]

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                # Restrict selection to formats at or below the limit.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # [elided in excerpt: else:]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # [elided in excerpt: return]
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [elided in excerpt: else:]
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    # [elided in excerpt: return]
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Fall back to mp4 for formats with no known extension.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # [elided in excerpt: try:]
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            # [elided in excerpt: closing '})']
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Appends ``skin=json&version=2&no_wrap=1`` to the page URL and parses
    the JSON the site returns instead of scraping HTML.

    NOTE(review): this excerpt elides several original lines (guards,
    ``try:`` openers, ``return`` statements, dict initializer/closer);
    ``[elided]`` comments below mark the gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

    def _simplify_title(self, title):
        # Collapse anything outside the safe charset to '_', trim edges.
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        # [elided in excerpt: return res]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt: if mobj is None:]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # [elided in excerpt: cchar = '&' or '?' depending on url]
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        # [elided in excerpt: try:]
        json_code = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        # [elided in excerpt: try:]
        json_data = json.loads(json_code)
        if 'Post' in json_data:
            data = json_data['Post']
        # [elided in excerpt: else: data = json_data]

        # Site datestamps look like '05-31-11 08:00AM'; normalize to YYYYMMDD.
        upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
        video_url = data['media']['url']
        umobj = re.match(self._URL_EXT, video_url)
        # [elided in excerpt: if umobj is None:]
        raise ValueError('Can not determine filename extension')
        ext = umobj.group(1)

        self._downloader.increment_downloads()

        # [elided in excerpt: info = { opener]
        'id': data['item_id'],
        'uploader': data['display_name'],
        'upload_date': upload_date,
        'title': data['title'],
        'stitle': self._simplify_title(data['title']),
        'format': data['media']['mimeType'],
        'thumbnail': data['thumbnailUrl'],
        'description': data['description'],
        'player_url': data['embedUrl']
        # [elided in excerpt: closing '}']
        except (ValueError,KeyError), err:
            # Any missing JSON key or bad date lands here.
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
            # [elided in excerpt: return]

        # [elided in excerpt: try:]
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2955 class MyVideoIE(InfoExtractor):
2956 """Information Extractor for myvideo.de."""
2958 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2959 IE_NAME = u'myvideo'
2961 def __init__(self, downloader=None):
2962 InfoExtractor.__init__(self, downloader)
2964 def report_download_webpage(self, video_id):
2965 """Report webpage download."""
2966 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2968 def report_extraction(self, video_id):
2969 """Report information extraction."""
2970 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2972 def _real_initialize(self):
2975 def _real_extract(self,url):
2976 mobj = re.match(self._VALID_URL, url)
2978 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2981 video_id = mobj.group(1)
2982 simple_title = mobj.group(2).decode('utf-8')
2983 # should actually not be necessary
2984 simple_title = sanitize_title(simple_title)
2985 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2988 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2990 self.report_download_webpage(video_id)
2991 webpage = urllib2.urlopen(request).read()
2992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2996 self.report_extraction(video_id)
2997 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3000 self._downloader.trouble(u'ERROR: unable to extract media URL')
3002 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3004 mobj = re.search('<title>([^<]+)</title>', webpage)
3006 self._downloader.trouble(u'ERROR: unable to extract title')
3009 video_title = mobj.group(1)
3010 video_title = sanitize_title(video_title)
3014 self._downloader.process_info({
3018 'upload_date': u'NA',
3019 'title': video_title,
3020 'stitle': simple_title,
3025 except UnavailableVideoError:
3026 self._downloader.trouble(u'\nERROR: Unable to download video')
3028 class ComedyCentralIE(InfoExtractor):
3029 """Information extractor for The Daily Show and Colbert Report """
3031 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3032 IE_NAME = u'comedycentral'
3034 def report_extraction(self, episode_id):
3035 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3037 def report_config_download(self, episode_id):
3038 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3040 def report_index_download(self, episode_id):
3041 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3043 def report_player_url(self, episode_id):
3044 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3046 def _simplify_title(self, title):
3047 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3048 res = res.strip(ur'_')
3051 def _real_extract(self, url):
3052 mobj = re.match(self._VALID_URL, url)
3054 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3057 if mobj.group('shortname'):
3058 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3059 url = 'http://www.thedailyshow.com/full-episodes/'
3061 url = 'http://www.colbertnation.com/full-episodes/'
3062 mobj = re.match(self._VALID_URL, url)
3063 assert mobj is not None
3065 dlNewest = not mobj.group('episode')
3067 epTitle = mobj.group('showname')
3069 epTitle = mobj.group('episode')
3071 req = urllib2.Request(url)
3072 self.report_extraction(epTitle)
3074 htmlHandle = urllib2.urlopen(req)
3075 html = htmlHandle.read()
3076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3077 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3080 url = htmlHandle.geturl()
3081 mobj = re.match(self._VALID_URL, url)
3083 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3085 if mobj.group('episode') == '':
3086 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3088 epTitle = mobj.group('episode')
3090 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3091 if len(mMovieParams) == 0:
3092 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3095 playerUrl_raw = mMovieParams[0][0]
3096 self.report_player_url(epTitle)
3098 urlHandle = urllib2.urlopen(playerUrl_raw)
3099 playerUrl = urlHandle.geturl()
3100 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3101 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3104 uri = mMovieParams[0][1]
3105 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3106 self.report_index_download(epTitle)
3108 indexXml = urllib2.urlopen(indexUrl).read()
3109 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3110 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3113 idoc = xml.etree.ElementTree.fromstring(indexXml)
3114 itemEls = idoc.findall('.//item')
3115 for itemEl in itemEls:
3116 mediaId = itemEl.findall('./guid')[0].text
3117 shortMediaId = mediaId.split(':')[-1]
3118 showId = mediaId.split(':')[-2].replace('.com', '')
3119 officialTitle = itemEl.findall('./title')[0].text
3120 officialDate = itemEl.findall('./pubDate')[0].text
3122 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3123 urllib.urlencode({'uri': mediaId}))
3124 configReq = urllib2.Request(configUrl)
3125 self.report_config_download(epTitle)
3127 configXml = urllib2.urlopen(configReq).read()
3128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3129 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3132 cdoc = xml.etree.ElementTree.fromstring(configXml)
3134 for rendition in cdoc.findall('.//rendition'):
3135 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3139 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3142 # For now, just pick the highest bitrate
3143 format,video_url = turls[-1]
3145 self._downloader.increment_downloads()
3147 effTitle = showId + '-' + epTitle
3152 'upload_date': officialDate,
3154 'stitle': self._simplify_title(effTitle),
3158 'description': officialTitle,
3159 'player_url': playerUrl
3163 self._downloader.process_info(info)
3164 except UnavailableVideoError, err:
3165 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3169 class EscapistIE(InfoExtractor):
3170 """Information extractor for The Escapist """
3172 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3173 IE_NAME = u'escapist'
3175 def report_extraction(self, showName):
3176 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3178 def report_config_download(self, showName):
3179 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3181 def _simplify_title(self, title):
3182 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3183 res = res.strip(ur'_')
3186 def _real_extract(self, url):
3187 htmlParser = HTMLParser.HTMLParser()
3189 mobj = re.match(self._VALID_URL, url)
3191 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3193 showName = mobj.group('showname')
3194 videoId = mobj.group('episode')
3196 self.report_extraction(showName)
3198 webPage = urllib2.urlopen(url).read()
3199 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3200 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3203 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3204 description = htmlParser.unescape(descMatch.group(1))
3205 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3206 imgUrl = htmlParser.unescape(imgMatch.group(1))
3207 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3208 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3209 configUrlMatch = re.search('config=(.*)$', playerUrl)
3210 configUrl = urllib2.unquote(configUrlMatch.group(1))
3212 self.report_config_download(showName)
3214 configJSON = urllib2.urlopen(configUrl).read()
3215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3216 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3219 # Technically, it's JavaScript, not JSON
3220 configJSON = configJSON.replace("'", '"')
3223 config = json.loads(configJSON)
3224 except (ValueError,), err:
3225 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3228 playlist = config['playlist']
3229 videoUrl = playlist[1]['url']
3231 self._downloader.increment_downloads()
3235 'uploader': showName,
3236 'upload_date': None,
3238 'stitle': self._simplify_title(showName),
3241 'thumbnail': imgUrl,
3242 'description': description,
3243 'player_url': playerUrl,
3247 self._downloader.process_info(info)
3248 except UnavailableVideoError, err:
3249 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3253 class PostProcessor(object):
3254 """Post Processor class.
3256 PostProcessor objects can be added to downloaders with their
3257 add_post_processor() method. When the downloader has finished a
3258 successful download, it will take its internal chain of PostProcessors
3259 and start calling the run() method on each one of them, first with
3260 an initial argument and then with the returned value of the previous
3263 The chain will be stopped if one of them ever returns None or the end
3264 of the chain is reached.
3266 PostProcessor objects follow a "mutual registration" process similar
3267 to InfoExtractor objects.
3272 def __init__(self, downloader=None):
3273 self._downloader = downloader
3275 def set_downloader(self, downloader):
3276 """Sets the downloader for this PP."""
3277 self._downloader = downloader
3279 def run(self, information):
3280 """Run the PostProcessor.
3282 The "information" argument is a dictionary like the ones
3283 composed by InfoExtractors. The only difference is that this
3284 one has an extra field called "filepath" that points to the
3287 When this method returns None, the postprocessing chain is
3288 stopped. However, this method may return an information
3289 dictionary that will be passed to the next postprocessing
3290 object in the chain. It can be the one it received after
3291 changing some fields.
3293 In addition, this method may raise a PostProcessingError
3294 exception that will be taken into account by the downloader
3297 return information # by default, do nothing
3300 class FFmpegExtractAudioPP(PostProcessor):
3302 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3303 PostProcessor.__init__(self, downloader)
3304 if preferredcodec is None:
3305 preferredcodec = 'best'
3306 self._preferredcodec = preferredcodec
3307 self._preferredquality = preferredquality
3308 self._keepvideo = keepvideo
3311 def get_audio_codec(path):
3313 cmd = ['ffprobe', '-show_streams', '--', path]
3314 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3315 output = handle.communicate()[0]
3316 if handle.wait() != 0:
3318 except (IOError, OSError):
3321 for line in output.split('\n'):
3322 if line.startswith('codec_name='):
3323 audio_codec = line.split('=')[1].strip()
3324 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3329 def run_ffmpeg(path, out_path, codec, more_opts):
3331 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3332 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3334 except (IOError, OSError):
3337 def run(self, information):
3338 path = information['filepath']
3340 filecodec = self.get_audio_codec(path)
3341 if filecodec is None:
3342 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3346 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3347 if filecodec == 'aac' or filecodec == 'mp3':
3348 # Lossless if possible
3350 extension = filecodec
3351 if filecodec == 'aac':
3352 more_opts = ['-f', 'adts']
3355 acodec = 'libmp3lame'
3358 if self._preferredquality is not None:
3359 more_opts += ['-ab', self._preferredquality]
3361 # We convert the audio (lossy)
3362 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3363 extension = self._preferredcodec
3365 if self._preferredquality is not None:
3366 more_opts += ['-ab', self._preferredquality]
3367 if self._preferredcodec == 'aac':
3368 more_opts += ['-f', 'adts']
3370 (prefix, ext) = os.path.splitext(path)
3371 new_path = prefix + '.' + extension
3372 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3373 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3376 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3379 # Try to update the date time for extracted audio file.
3380 if information.get('filetime') is not None:
3382 os.utime(new_path, (time.time(), information['filetime']))
3384 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3386 if not self._keepvideo:
3389 except (IOError, OSError):
3390 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3393 information['filepath'] = new_path
3397 def updateSelf(downloader, filename):
3398 ''' Update the program file with the latest version from the repository '''
3399 # Note: downloader only used for options
3400 if not os.access(filename, os.W_OK):
3401 sys.exit('ERROR: no write permissions on %s' % filename)
3403 downloader.to_screen('Updating to latest version...')
3407 urlh = urllib.urlopen(UPDATE_URL)
3408 newcontent = urlh.read()
3411 except (IOError, OSError), err:
3412 sys.exit('ERROR: unable to download latest version')
3415 outf = open(filename, 'wb')
3417 outf.write(newcontent)
3420 except (IOError, OSError), err:
3421 sys.exit('ERROR: unable to overwrite current version')
3423 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3430 def _format_option_string(option):
3431 ''' ('-o', '--option') -> -o, --format METAVAR'''
3435 if option._short_opts: opts.append(option._short_opts[0])
3436 if option._long_opts: opts.append(option._long_opts[0])
3437 if len(opts) > 1: opts.insert(1, ', ')
3439 if option.takes_value(): opts.append(' %s' % option.metavar)
3441 return "".join(opts)
3443 def _find_term_columns():
3444 columns = os.environ.get('COLUMNS', None)
3449 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3450 out,err = sp.communicate()
3451 return int(out.split()[1])
3457 max_help_position = 80
3459 # No need to wrap help messages if we're on a wide console
3460 columns = _find_term_columns()
3461 if columns: max_width = columns
3463 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3464 fmt.format_option_strings = _format_option_string
3467 'version' : __version__,
3469 'usage' : '%prog [options] url [url...]',
3470 'conflict_handler' : 'resolve',
3473 parser = optparse.OptionParser(**kw)
3476 general = optparse.OptionGroup(parser, 'General Options')
3477 selection = optparse.OptionGroup(parser, 'Video Selection')
3478 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3479 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3480 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3481 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3482 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3484 general.add_option('-h', '--help',
3485 action='help', help='print this help text and exit')
3486 general.add_option('-v', '--version',
3487 action='version', help='print program version and exit')
3488 general.add_option('-U', '--update',
3489 action='store_true', dest='update_self', help='update this program to latest version')
3490 general.add_option('-i', '--ignore-errors',
3491 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3492 general.add_option('-r', '--rate-limit',
3493 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3494 general.add_option('-R', '--retries',
3495 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3496 general.add_option('--dump-user-agent',
3497 action='store_true', dest='dump_user_agent',
3498 help='display the current browser identification', default=False)
3499 general.add_option('--list-extractors',
3500 action='store_true', dest='list_extractors',
3501 help='List all supported extractors and the URLs they would handle', default=False)
3503 selection.add_option('--playlist-start',
3504 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3505 selection.add_option('--playlist-end',
3506 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3507 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3508 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3510 authentication.add_option('-u', '--username',
3511 dest='username', metavar='USERNAME', help='account username')
3512 authentication.add_option('-p', '--password',
3513 dest='password', metavar='PASSWORD', help='account password')
3514 authentication.add_option('-n', '--netrc',
3515 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3518 video_format.add_option('-f', '--format',
3519 action='store', dest='format', metavar='FORMAT', help='video format code')
3520 video_format.add_option('--all-formats',
3521 action='store_const', dest='format', help='download all available video formats', const='all')
3522 video_format.add_option('--max-quality',
3523 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3526 verbosity.add_option('-q', '--quiet',
3527 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3528 verbosity.add_option('-s', '--simulate',
3529 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3530 verbosity.add_option('--skip-download',
3531 action='store_true', dest='skip_download', help='do not download the video', default=False)
3532 verbosity.add_option('-g', '--get-url',
3533 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3534 verbosity.add_option('-e', '--get-title',
3535 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3536 verbosity.add_option('--get-thumbnail',
3537 action='store_true', dest='getthumbnail',
3538 help='simulate, quiet but print thumbnail URL', default=False)
3539 verbosity.add_option('--get-description',
3540 action='store_true', dest='getdescription',
3541 help='simulate, quiet but print video description', default=False)
3542 verbosity.add_option('--get-filename',
3543 action='store_true', dest='getfilename',
3544 help='simulate, quiet but print output filename', default=False)
3545 verbosity.add_option('--get-format',
3546 action='store_true', dest='getformat',
3547 help='simulate, quiet but print output format', default=False)
3548 verbosity.add_option('--no-progress',
3549 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3550 verbosity.add_option('--console-title',
3551 action='store_true', dest='consoletitle',
3552 help='display progress in console titlebar', default=False)
3555 filesystem.add_option('-t', '--title',
3556 action='store_true', dest='usetitle', help='use title in file name', default=False)
3557 filesystem.add_option('-l', '--literal',
3558 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3559 filesystem.add_option('-A', '--auto-number',
3560 action='store_true', dest='autonumber',
3561 help='number downloaded files starting from 00000', default=False)
3562 filesystem.add_option('-o', '--output',
3563 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3564 filesystem.add_option('-a', '--batch-file',
3565 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3566 filesystem.add_option('-w', '--no-overwrites',
3567 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3568 filesystem.add_option('-c', '--continue',
3569 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3570 filesystem.add_option('--cookies',
3571 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3572 filesystem.add_option('--no-part',
3573 action='store_true', dest='nopart', help='do not use .part files', default=False)
3574 filesystem.add_option('--no-mtime',
3575 action='store_false', dest='updatetime',
3576 help='do not use the Last-modified header to set the file modification time', default=True)
3577 filesystem.add_option('--write-description',
3578 action='store_true', dest='writedescription',
3579 help='write video description to a .description file', default=False)
3580 filesystem.add_option('--write-info-json',
3581 action='store_true', dest='writeinfojson',
3582 help='write video metadata to a .info.json file', default=False)
3585 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3586 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3587 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3588 help='"best", "aac" or "mp3"; best by default')
3589 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3590 help='ffmpeg audio bitrate specification, 128k by default')
3591 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3592 help='keeps the video file on disk after the post-processing; the video is erased by default')
3595 parser.add_option_group(general)
3596 parser.add_option_group(selection)
3597 parser.add_option_group(filesystem)
3598 parser.add_option_group(verbosity)
3599 parser.add_option_group(video_format)
3600 parser.add_option_group(authentication)
3601 parser.add_option_group(postproc)
3603 opts, args = parser.parse_args()
3605 return parser, opts, args
3607 def gen_extractors():
3608 """ Return a list of an instance of every supported extractor.
3609 The order does matter; the first extractor matched is the one handling the URL.
3611 youtube_ie = YoutubeIE()
3612 google_ie = GoogleIE()
3613 yahoo_ie = YahooIE()
3616 MetacafeIE(youtube_ie),
3618 YoutubePlaylistIE(youtube_ie),
3619 YoutubeUserIE(youtube_ie),
3620 YoutubeSearchIE(youtube_ie),
3622 GoogleSearchIE(google_ie),
3625 YahooSearchIE(yahoo_ie),
3638 parser, opts, args = parseOpts()
3640 # Open appropriate CookieJar
3641 if opts.cookiefile is None:
3642 jar = cookielib.CookieJar()
3645 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3646 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3648 except (IOError, OSError), err:
3649 sys.exit(u'ERROR: unable to open cookie file')
3652 if opts.dump_user_agent:
3653 print std_headers['User-Agent']
3656 # Batch file verification
3658 if opts.batchfile is not None:
3660 if opts.batchfile == '-':
3663 batchfd = open(opts.batchfile, 'r')
3664 batchurls = batchfd.readlines()
3665 batchurls = [x.strip() for x in batchurls]
3666 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3668 sys.exit(u'ERROR: batch file could not be read')
3669 all_urls = batchurls + args
3671 # General configuration
3672 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3673 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3674 urllib2.install_opener(opener)
3675 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3677 extractors = gen_extractors()
3679 if opts.list_extractors:
3680 for ie in extractors:
3682 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3683 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3684 for mu in matchedUrls:
3688 # Conflicting, missing and erroneous options
3689 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3690 parser.error(u'using .netrc conflicts with giving username/password')
3691 if opts.password is not None and opts.username is None:
3692 parser.error(u'account username missing')
3693 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3694 parser.error(u'using output template conflicts with using title, literal title or auto number')
3695 if opts.usetitle and opts.useliteral:
3696 parser.error(u'using title conflicts with using literal title')
3697 if opts.username is not None and opts.password is None:
3698 opts.password = getpass.getpass(u'Type account password and press return:')
3699 if opts.ratelimit is not None:
3700 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3701 if numeric_limit is None:
3702 parser.error(u'invalid rate limit specified')
3703 opts.ratelimit = numeric_limit
3704 if opts.retries is not None:
3706 opts.retries = long(opts.retries)
3707 except (TypeError, ValueError), err:
3708 parser.error(u'invalid retry count specified')
3710 opts.playliststart = int(opts.playliststart)
3711 if opts.playliststart <= 0:
3712 raise ValueError(u'Playlist start must be positive')
3713 except (TypeError, ValueError), err:
3714 parser.error(u'invalid playlist start number specified')
3716 opts.playlistend = int(opts.playlistend)
3717 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3718 raise ValueError(u'Playlist end must be greater than playlist start')
3719 except (TypeError, ValueError), err:
3720 parser.error(u'invalid playlist end number specified')
3721 if opts.extractaudio:
3722 if opts.audioformat not in ['best', 'aac', 'mp3']:
3723 parser.error(u'invalid audio format specified')
3726 fd = FileDownloader({
3727 'usenetrc': opts.usenetrc,
3728 'username': opts.username,
3729 'password': opts.password,
3730 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3731 'forceurl': opts.geturl,
3732 'forcetitle': opts.gettitle,
3733 'forcethumbnail': opts.getthumbnail,
3734 'forcedescription': opts.getdescription,
3735 'forcefilename': opts.getfilename,
3736 'forceformat': opts.getformat,
3737 'simulate': opts.simulate,
3738 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3739 'format': opts.format,
3740 'format_limit': opts.format_limit,
3741 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3742 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3743 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3744 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3745 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3746 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3747 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3748 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3749 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3750 or u'%(id)s.%(ext)s'),
3751 'ignoreerrors': opts.ignoreerrors,
3752 'ratelimit': opts.ratelimit,
3753 'nooverwrites': opts.nooverwrites,
3754 'retries': opts.retries,
3755 'continuedl': opts.continue_dl,
3756 'noprogress': opts.noprogress,
3757 'playliststart': opts.playliststart,
3758 'playlistend': opts.playlistend,
3759 'logtostderr': opts.outtmpl == '-',
3760 'consoletitle': opts.consoletitle,
3761 'nopart': opts.nopart,
3762 'updatetime': opts.updatetime,
3763 'writedescription': opts.writedescription,
3764 'writeinfojson': opts.writeinfojson,
3765 'matchtitle': opts.matchtitle,
3766 'rejecttitle': opts.rejecttitle,
3768 for extractor in extractors:
3769 fd.add_info_extractor(extractor)
3772 if opts.extractaudio:
3773 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3776 if opts.update_self:
3777 updateSelf(fd, sys.argv[0])
3780 if len(all_urls) < 1:
3781 if not opts.update_self:
3782 parser.error(u'you must provide at least one URL')
3785 retcode = fd.download(all_urls)
3787 # Dump cookie jar if requested
3788 if opts.cookiefile is not None:
3791 except (IOError, OSError), err:
3792 sys.exit(u'ERROR: unable to save cookie jar')
3797 if __name__ == '__main__':
3800 except DownloadError:
3802 except SameFileError:
3803 sys.exit(u'ERROR: fixed output name but more than one file to download')
3804 except KeyboardInterrupt:
3805 sys.exit(u'\nERROR: Interrupted by user')
3807 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: