2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2011.12.08'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# --- Bundled "trivialjson" fallback JSON decoder, used when neither the
# stdlib json module nor simplejson is available (Python < 2.6). ---
# NOTE(review): this dump is missing many intervening source lines (the
# embedded line numbers are discontinuous); comments describe only what is
# visible. These nested helpers close over `s`, the JSON text being
# decoded, and each parse* helper returns (index_after_value, value).
91 def raiseError(msg, i):
# Errors include the offending position and the remaining unparsed text.
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
# Advance past JSON whitespace; with expectMore set, running off the end
# of the input is an error.
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
# Single \uXXXX escape: decode one BMP code point.
116 return unichr(int(esc[1:5], 16))
# Surrogate pair written as two consecutive \u escapes (5 + 6 chars).
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
# Combine high/low surrogates into the astral code point.
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
# Locate the closing quote: count the backslashes immediately before it;
# an odd count means the quote itself is escaped.
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
# Matches a surrogate pair, a single \uXXXX escape, or any escaped char.
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
# --- object parsing fragments ---
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
# --- array parsing fragments ---
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
# The three JSON keyword literals map to their Python equivalents.
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
# JSON number grammar: optional sign, integer part, fraction, exponent.
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
# A dot or exponent marker means float; otherwise parse as int.
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
# Dispatch on the first character of a value; anything else is a number.
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
# Any trailing non-whitespace after the top-level value is an error.
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# NOTE(review): intervening lines are elided in this dump. The generator
# indirection below apparently exists so any exception raised by
# locale.getpreferredencoding() is handled in one place; .next() is the
# Python 2 way to pull the single computed value out of the generator.
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
210 return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function. It decodes named entities (via
	htmlentitydefs.name2codepoint) and numeric character references, both
	decimal ('&#169;') and hexadecimal ('&#xA9;'). Unknown entities are
	returned as their literal '&name;' text so no information is lost.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference. The previous pattern, '#(x?\d+)',
	# only matched decimal digits after the optional 'x', so hexadecimal
	# references containing letters (e.g. '&#xA9;') were truncated or
	# missed entirely; match hex digits explicitly instead.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = numstr[1:]
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
# Opens `filename`, retrying with a sanitized name (Windows-forbidden
# characters replaced by '#') when the first open() fails. The special
# name '-' means stdout, switched to binary mode on Windows so video
# data is not mangled by CRLF translation.
# NOTE(review): some original lines are elided in this dump (the
# try/except framing around the opens is partially missing).
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
# NOTE(review): lines are elided here; presumably `timestamp` defaults
# to None when the date string cannot be parsed and is returned at the
# end -- confirm against the full source.
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
# NOTE(review): the implementation lines are missing from this dump; the
# docstring indicates order-preserving de-duplication.
# --- Exception hierarchy used throughout the downloader. ---
# NOTE(review): blank and `pass` lines are elided in this dump.
293 class DownloadError(Exception):
294 """Download Error exception.
296 This exception may be thrown by FileDownloader objects if they are not
297 configured to continue on errors. They will contain the appropriate
303 class SameFileError(Exception):
304 """Same File exception.
306 This exception will be thrown by FileDownloader objects if they detect
307 multiple files would have to be downloaded to the same file on disk.
312 class PostProcessingError(Exception):
313 """Post Processing exception.
315 This exception may be raised by PostProcessor's .run() method to
316 indicate an error in the postprocessing task.
320 class MaxDownloadsReached(Exception):
321 """ --max-downloads limit has been reached. """
325 class UnavailableVideoError(Exception):
326 """Unavailable Format exception.
328 This exception will be thrown when a video is requested
329 in a format that is not available for that video.
334 class ContentTooShortError(Exception):
335 """Content Too Short exception.
337 This exception may be raised by FileDownloader objects when a file they
338 download is too small for what the server announced first, indicating
339 the connection was probably interrupted.
# Stores both byte counts so callers can report expected vs. received.
345 def __init__(self, downloaded, expected):
346 self.downloaded = downloaded
347 self.expected = expected
# HTTP handler that injects the module-level std_headers into every
# request and transparently decompresses gzip/deflate responses.
# NOTE(review): several original lines (decorators, try/except framing)
# are elided in this dump.
350 class YoutubeDLHandler(urllib2.HTTPHandler):
351 """Handler for HTTP requests and responses.
353 This class, when installed with an OpenerDirector, automatically adds
354 the standard headers to every HTTP request and handles gzipped and
355 deflated responses from web servers. If compression is to be avoided in
356 a particular request, the original request in the program code only has
357 to include the HTTP header "Youtubedl-No-Compression", which will be
358 removed before making the real request.
360 Part of this code was copied from:
362 http://techknack.net/python-urllib2-handlers/
364 Andrew Rowls, the author of that code, agreed to release it to the
# Raw deflate streams lack the zlib header; try raw first, then zlib.
371 return zlib.decompress(data, -zlib.MAX_WBITS)
373 return zlib.decompress(data)
# urllib2.addinfourl only grew a code argument/getcode() in later
# Python versions; emulate it when absent.
376 def addinfourl_wrapper(stream, headers, url, code):
377 if hasattr(urllib2.addinfourl, 'getcode'):
378 return urllib2.addinfourl(stream, headers, url, code)
379 ret = urllib2.addinfourl(stream, headers, url)
# Add each standard header (unless already set), and strip the internal
# no-compression marker plus Accept-encoding before sending.
383 def http_request(self, req):
384 for h in std_headers:
387 req.add_header(h, std_headers[h])
388 if 'Youtubedl-no-compression' in req.headers:
389 if 'Accept-encoding' in req.headers:
390 del req.headers['Accept-encoding']
391 del req.headers['Youtubedl-no-compression']
# Replace the response body with a decompressing wrapper when the server
# answered with gzip or deflate content encoding.
394 def http_response(self, req, resp):
397 if resp.headers.get('Content-encoding', '') == 'gzip':
398 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
399 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
400 resp.msg = old_resp.msg
402 if resp.headers.get('Content-encoding', '') == 'deflate':
403 gz = StringIO.StringIO(self.deflate(resp.read()))
404 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
405 resp.msg = old_resp.msg
# NOTE(review): many lines of this class header (including parts of the
# options list in the docstring) are elided in this dump.
409 class FileDownloader(object):
410 """File Downloader class.
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
# Class-level defaults; each instance overwrites these in __init__.
470 _download_retcode = None
471 _num_downloads = None
474 def __init__(self, params):
475 """Create a FileDownloader object with the given options."""
478 self._download_retcode = 0
479 self._num_downloads = 0
# Messages go to stderr when 'logtostderr' is set, otherwise stdout.
480 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# Formatting/math helpers. NOTE(review): decorator lines and several
# statements are elided in this dump; in the full source these appear to
# be staticmethods -- confirm against the complete file.
484 def format_bytes(bytes):
487 if type(bytes) is str:
# Pick the largest power-of-1024 unit and render with two decimals.
492 exponent = long(math.log(bytes, 1024.0))
493 suffix = 'bkMGTPEZY'[exponent]
494 converted = float(bytes) / float(1024 ** exponent)
495 return '%.2f%s' % (converted, suffix)
# Render a right-aligned percentage like ' 42.3%'.
498 def calc_percent(byte_counter, data_len):
501 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
504 def calc_eta(start, now, total, current):
# Guard against division by a near-zero rate or elapsed time.
508 if current == 0 or dif < 0.001: # One millisecond
510 rate = float(current) / dif
511 eta = long((float(total) - float(current)) / rate)
512 (eta_mins, eta_secs) = divmod(eta, 60)
515 return '%02d:%02d' % (eta_mins, eta_secs)
518 def calc_speed(start, now, bytes):
520 if bytes == 0 or dif < 0.001: # One millisecond
521 return '%10s' % '---b/s'
522 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adaptive block sizing: allow halving/doubling per iteration, clamped
# to at most 4 MiB.
525 def best_block_size(elapsed_time, bytes):
526 new_min = max(bytes / 2.0, 1.0)
527 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528 if elapsed_time < 0.001:
530 rate = bytes / elapsed_time
538 def parse_bytes(bytestr):
539 """Parse a string indicating a byte quantity into a long integer."""
540 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# The suffix's index in 'bkmgtpezy' gives the power of 1024 to apply.
543 number = float(matchobj.group(1))
544 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545 return long(round(number * multiplier))
# NOTE(review): list-append statements and try framing are elided in
# this dump.
547 def add_info_extractor(self, ie):
548 """Add an InfoExtractor object to the end of the list."""
# Mutual registration: the IE gets a back-reference to this downloader.
550 ie.set_downloader(self)
552 def add_post_processor(self, pp):
553 """Add a PostProcessor object to the end of the chain."""
555 pp.set_downloader(self)
557 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
558 """Print message to stdout if not in quiet mode."""
560 if not self.params.get('quiet', False):
561 terminator = [u'\n', u''][skip_eol]
562 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
563 self._screen_file.flush()
# Encoding failures are swallowed only when the caller opts in.
564 except (UnicodeEncodeError), err:
565 if not ignore_encoding_errors:
568 def to_stderr(self, message):
569 """Print message to stderr."""
570 print >>sys.stderr, message.encode(preferredencoding())
572 def to_cons_title(self, message):
573 """Set console/terminal window title to message."""
574 if not self.params.get('consoletitle', False):
576 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
577 # c_wchar_p() might not be necessary if `message` is
578 # already of type unicode()
579 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
# On POSIX terminals, use the xterm escape sequence to set the title.
580 elif 'TERM' in os.environ:
581 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
	"""Checks if the output template is fixed.

	A template is "fixed" when it contains no '%(...)s' substitution
	field, i.e. every download would be written to the same filename.
	"""
	outtmpl = self.params['outtmpl']
	has_field = re.search(u'(?u)%\\(.+?\\)s', outtmpl) is not None
	return not has_field
# NOTE(review): the docstring's closing quotes and some blank lines are
# elided in this dump.
587 def trouble(self, message=None):
588 """Determine action to take when a download problem appears.
590 Depending on if the downloader has been configured to ignore
591 download errors or not, this method may throw an exception or
592 not when errors are found, after printing the message.
594 if message is not None:
595 self.to_stderr(message)
596 if not self.params.get('ignoreerrors', False):
597 raise DownloadError(message)
# When errors are ignored, just remember a non-zero exit status.
598 self._download_retcode = 1
600 def slow_down(self, start_time, byte_counter):
601 """Sleep if the download speed is over the rate limit."""
# No throttling without a configured limit or before any data arrived.
602 rate_limit = self.params.get('ratelimit', None)
603 if rate_limit is None or byte_counter == 0:
606 elapsed = now - start_time
609 speed = float(byte_counter) / elapsed
610 if speed > rate_limit:
# Sleep exactly long enough that the average speed drops to the limit.
611 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
613 def temp_name(self, filename):
614 """Returns a temporary filename for the given filename."""
# Download straight to the target when .part files are disabled, when
# writing to stdout ('-'), or when the target exists but is not a
# regular file (the elided branch returns `filename` unchanged).
615 if self.params.get('nopart', False) or filename == u'-' or \
616 (os.path.exists(filename) and not os.path.isfile(filename)):
618 return filename + u'.part'
# Strip the '.part' suffix again; NOTE(review): the fall-through return
# for non-.part names is elided in this dump.
620 def undo_temp_name(self, filename):
621 if filename.endswith(u'.part'):
622 return filename[:-len(u'.part')]
# Rename the temporary .part file onto its final name, reporting (not
# raising) failures via self.trouble(). NOTE(review): the try framing
# and early return are elided in this dump.
625 def try_rename(self, old_filename, new_filename):
627 if old_filename == new_filename:
629 os.rename(old_filename, new_filename)
630 except (IOError, OSError), err:
631 self.trouble(u'ERROR: unable to rename file')
633 def try_utime(self, filename, last_modified_hdr):
634 """Try to set the last-modified time of the given file."""
# Nothing to do without a Last-modified header or an on-disk file.
635 if last_modified_hdr is None:
637 if not os.path.isfile(filename):
639 timestr = last_modified_hdr
642 filetime = timeconvert(timestr)
# Keep atime as "now"; only mtime is taken from the HTTP header.
646 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
	""" Report that the description file is being written """
	message = u'[info] Writing video description to: %s' % descfn
	self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	""" Report that the metadata file has been written """
	message = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Report destination filename."""
	message = u'[download] Destination: %s' % filename
	self.to_screen(message, ignore_encoding_errors=True)
663 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664 """Report download progress."""
# Suppressed entirely under --no-progress (elided return follows).
665 if self.params.get('noprogress', False):
# '\r' rewrites the same console line; the window title is updated too.
667 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Report attempt to resume at given byte."""
	message = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(message)
def report_retry(self, count, retries):
	"""Report retry in case of HTTP error 5xx"""
	message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(message)
680 def report_file_already_downloaded(self, file_name):
681 """Report file has already been fully downloaded."""
683 self.to_screen(u'[download] %s has already been downloaded' % file_name)
# Fall back to a filename-free message if the name cannot be encoded.
684 except (UnicodeEncodeError), err:
685 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Report it was impossible to resume download."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
691 def report_finish(self):
692 """Report download finished."""
# With --no-progress there is no progress line to terminate, so print a
# plain completion message. NOTE(review): the else branch is elided.
693 if self.params.get('noprogress', False):
694 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Increment the ordinal that assigns a number to each file."""
	# Feeds the %(autonumber)s field in the output template.
	self._num_downloads = self._num_downloads + 1
702 def prepare_filename(self, info_dict):
703 """Generate the output filename."""
# NOTE(review): the try framing and the successful-return line are
# elided in this dump.
705 template_dict = dict(info_dict)
# %(epoch)s and %(autonumber)s are synthesized here on top of the
# fields the InfoExtractor supplied.
706 template_dict['epoch'] = unicode(long(time.time()))
707 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708 filename = self.params['outtmpl'] % template_dict
710 except (ValueError, KeyError), err:
711 self.trouble(u'ERROR: invalid system charset or erroneous output template')
def _match_entry(self, info_dict):
	""" Returns None iff the file should be downloaded """

	title = info_dict['title']
	matchtitle = self.params.get('matchtitle', False)
	if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
		# The caller (process_info) prefixes the returned reason with
		# u'[download] '; embedding the tag here as well produced a
		# doubled "[download] [download]" prefix, and was inconsistent
		# with the rejecttitle message below.
		return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
	rejecttitle = self.params.get('rejecttitle', False)
	if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
		return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
	return None
# NOTE(review): many lines (try framing, returns, blank lines) are
# elided in this dump; comments describe only the visible statements.
726 def process_info(self, info_dict):
727 """Process a single dictionary returned by an InfoExtractor."""
# Title match/reject filters may veto the download with a reason string.
729 reason = self._match_entry(info_dict)
730 if reason is not None:
731 self.to_screen(u'[download] ' + reason)
# Enforce the --max-downloads limit by raising out of the whole run.
734 max_downloads = self.params.get('max_downloads')
735 if max_downloads is not None:
736 if self._num_downloads > int(max_downloads):
737 raise MaxDownloadsReached()
739 filename = self.prepare_filename(info_dict)
# --get-* style forced printing of individual fields to stdout.
742 if self.params.get('forcetitle', False):
743 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744 if self.params.get('forceurl', False):
745 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
747 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748 if self.params.get('forcedescription', False) and 'description' in info_dict:
749 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750 if self.params.get('forcefilename', False) and filename is not None:
751 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
752 if self.params.get('forceformat', False):
753 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
755 # Do nothing else if in simulate mode
756 if self.params.get('simulate', False):
762 if self.params.get('nooverwrites', False) and os.path.exists(filename):
763 self.to_stderr(u'WARNING: file exists and will be skipped')
# Create the destination directory if the template produced one.
767 dn = os.path.dirname(filename)
768 if dn != '' and not os.path.exists(dn):
770 except (OSError, IOError), err:
771 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
# Optional side files: .description and .info.json metadata.
774 if self.params.get('writedescription', False):
776 descfn = filename + '.description'
777 self.report_writedescription(descfn)
778 descfile = open(descfn, 'wb')
780 descfile.write(info_dict['description'].encode('utf-8'))
783 except (OSError, IOError):
784 self.trouble(u'ERROR: Cannot write description file ' + descfn)
787 if self.params.get('writeinfojson', False):
788 infofn = filename + '.info.json'
789 self.report_writeinfojson(infofn)
# `json` may be the trivialjson fallback, which has no encoder.
792 except (NameError,AttributeError):
793 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
796 infof = open(infofn, 'wb')
# 'urlhandle' holds a live socket object and is not JSON-serializable.
798 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
799 json.dump(json_info_dict, infof)
802 except (OSError, IOError):
803 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
# The actual download, with errors mapped to user-facing messages.
806 if not self.params.get('skip_download', False):
808 success = self._do_download(filename, info_dict)
809 except (OSError, IOError), err:
810 raise UnavailableVideoError
811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
812 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
814 except (ContentTooShortError, ), err:
815 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Postprocessing runs only on a successfully downloaded file.
820 self.post_process(filename, info_dict)
821 except (PostProcessingError), err:
822 self.trouble(u'ERROR: postprocessing: %s' % str(err))
825 def download(self, url_list):
826 """Download a given list of URLs."""
# A fixed (field-less) template with multiple URLs would overwrite the
# same file repeatedly -- refuse up front.
827 if len(url_list) > 1 and self.fixed_template():
828 raise SameFileError(self.params['outtmpl'])
# NOTE(review): the loops over url_list and self._ies are partially
# elided in this dump.
831 suitable_found = False
833 # Go to next InfoExtractor if not suitable
834 if not ie.suitable(url):
837 # Suitable InfoExtractor found
838 suitable_found = True
840 # Extract information from URL and process it
843 # Suitable InfoExtractor had been found; go to next URL
846 if not suitable_found:
847 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
849 return self._download_retcode
851 def post_process(self, filename, ie_info):
852 """Run the postprocessing chain on the given file."""
# NOTE(review): the copy of ie_info into `info` and the loop over the
# registered postprocessors are elided in this dump.
854 info['filepath'] = filename
# NOTE(review): several lines (returns, retval checks) are elided in
# this dump.
860 def _download_with_rtmpdump(self, filename, url, player_url):
861 self.report_destination(filename)
862 tmpfilename = self.temp_name(filename)
864 # Check for rtmpdump first
# Probe for the external binary by running its help; stdout discarded.
866 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
867 except (OSError, IOError):
868 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
871 # Download using rtmpdump. rtmpdump returns exit code 2 when
872 # the connection was interrumpted and resuming appears to be
873 # possible. This is part of rtmpdump's normal usage, AFAIK.
874 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
875 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
# Keep resuming while rtmpdump signals an interrupted-but-resumable run.
876 while retval == 2 or retval == 1:
877 prevsize = os.path.getsize(tmpfilename)
878 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
879 time.sleep(5.0) # This seems to be needed
880 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
881 cursize = os.path.getsize(tmpfilename)
# No progress between retries means resuming is going nowhere.
882 if prevsize == cursize and retval == 1:
884 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
885 if prevsize == cursize and retval == 2 and cursize > 1024:
886 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
890 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
891 self.try_rename(tmpfilename, filename)
894 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# Core HTTP download loop: resume support, 5xx retries, adaptive block
# size, rate limiting, and .part-file handling. NOTE(review): a large
# number of lines (try framing, returns, loop heads) are elided in this
# dump; comments describe only the visible statements.
897 def _do_download(self, filename, info_dict):
898 url = info_dict['url']
899 player_url = info_dict.get('player_url', None)
901 # Check file already present
902 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
903 self.report_file_already_downloaded(filename)
906 # Attempt to download using rtmpdump
907 if url.startswith('rtmp'):
908 return self._download_with_rtmpdump(filename, url, player_url)
910 tmpfilename = self.temp_name(filename)
913 # Do not include the Accept-Encoding header
# basic_request is kept without the Range header for the 416 fallback.
914 headers = {'Youtubedl-no-compression': 'True'}
915 basic_request = urllib2.Request(url, None, headers)
916 request = urllib2.Request(url, None, headers)
918 # Establish possible resume length
919 if os.path.isfile(tmpfilename):
920 resume_len = os.path.getsize(tmpfilename)
926 if self.params.get('continuedl', False):
927 self.report_resuming_byte(resume_len)
928 request.add_header('Range','bytes=%d-' % resume_len)
# Retry loop for transient (HTTP 5xx) server errors.
934 retries = self.params.get('retries', 0)
935 while count <= retries:
936 # Establish connection
# A pre-opened handle from the InfoExtractor is reused on first try.
938 if count == 0 and 'urlhandle' in info_dict:
939 data = info_dict['urlhandle']
940 data = urllib2.urlopen(request)
942 except (urllib2.HTTPError, ), err:
943 if (err.code < 500 or err.code >= 600) and err.code != 416:
944 # Unexpected HTTP error
946 elif err.code == 416:
947 # Unable to resume (requested range not satisfiable)
949 # Open the connection again without the range header
950 data = urllib2.urlopen(basic_request)
951 content_length = data.info()['Content-Length']
952 except (urllib2.HTTPError, ), err:
953 if err.code < 500 or err.code >= 600:
956 # Examine the reported length
957 if (content_length is not None and
958 (resume_len - 100 < long(content_length) < resume_len + 100)):
959 # The file had already been fully downloaded.
960 # Explanation to the above condition: in issue #175 it was revealed that
961 # YouTube sometimes adds or removes a few bytes from the end of the file,
962 # changing the file size slightly and causing problems for some users. So
963 # I decided to implement a suggested change and consider the file
964 # completely downloaded if the file size differs less than 100 bytes from
965 # the one in the hard drive.
966 self.report_file_already_downloaded(filename)
967 self.try_rename(tmpfilename, filename)
970 # The length does not match, we start the download over
971 self.report_unable_to_resume()
977 self.report_retry(count, retries)
980 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Total size accounts for the bytes already on disk when resuming.
983 data_len = data.info().get('Content-length', None)
984 if data_len is not None:
985 data_len = long(data_len) + resume_len
986 data_len_str = self.format_bytes(data_len)
987 byte_counter = 0 + resume_len
# Main read/write loop (loop head elided in this dump).
993 data_block = data.read(block_size)
995 if len(data_block) == 0:
997 byte_counter += len(data_block)
999 # Open file just in time
1002 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1003 assert stream is not None
# sanitize_open may have altered the temp name; recompute the target.
1004 filename = self.undo_temp_name(tmpfilename)
1005 self.report_destination(filename)
1006 except (OSError, IOError), err:
1007 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1010 stream.write(data_block)
1011 except (IOError, OSError), err:
1012 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Adapt the next read size to the measured throughput.
1014 block_size = self.best_block_size(after - before, len(data_block))
1017 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1018 if data_len is None:
1019 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1021 percent_str = self.calc_percent(byte_counter, data_len)
1022 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1023 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
# Honor --rate-limit between blocks.
1026 self.slow_down(start, byte_counter - resume_len)
1029 self.trouble(u'\nERROR: Did not get any data blocks')
1032 self.report_finish()
# A short read relative to the announced length is an error.
1033 if data_len is not None and byte_counter != data_len:
1034 raise ContentTooShortError(byte_counter, long(data_len))
1035 self.try_rename(tmpfilename, filename)
1037 # Update file modification time
1038 if self.params.get('updatetime', True):
1039 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
# NOTE(review): attribute definitions (_ready, _downloader) and blank
# lines are elided in this dump.
1044 class InfoExtractor(object):
1045 """Information Extractor class.
1047 Information extractors are the classes that, given a URL, extract
1048 information from the video (or videos) the URL refers to. This
1049 information includes the real video URL, the video title and simplified
1050 title, author and others. The information is stored in a dictionary
1051 which is then passed to the FileDownloader. The FileDownloader
1052 processes this information possibly downloading the video to the file
1053 system, among other possible outcomes. The dictionaries must include
1054 the following fields:
1056 id: Video identifier.
1057 url: Final video URL.
1058 uploader: Nickname of the video uploader.
1059 title: Literal title.
1060 stitle: Simplified title.
1061 ext: Video filename extension.
1062 format: Video format.
1063 player_url: SWF Player URL (may be None).
1065 The following fields are optional. Their primary purpose is to allow
1066 youtube-dl to serve as the backend for a video search function, such
1067 as the one in youtube2mp3. They are only used when their respective
1068 forced printing functions are called:
1070 thumbnail: Full URL to a video thumbnail image.
1071 description: One-line video description.
1073 Subclasses of this one should re-define the _real_initialize() and
1074 _real_extract() methods and define a _VALID_URL regexp.
1075 Probably, they should also be added to the list of extractors.
1081 def __init__(self, downloader=None):
1082 """Constructor. Receives an optional downloader."""
1084 self.set_downloader(downloader)
def suitable(self, url):
	"""Receives a URL and returns True if suitable for this IE."""
	match = re.match(self._VALID_URL, url)
	return match is not None
1090 def initialize(self):
1091 """Initializes an instance (authentication, etc)."""
# NOTE(review): the guard that makes this run only once is elided.
1093 self._real_initialize()
1096 def extract(self, url):
1097 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): the call to self.initialize() is elided in this dump.
1099 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Sets the downloader for this IE."""
	# Back-reference used for progress reporting and option access.
	self._downloader = downloader
# Template-method stubs; subclasses override both. NOTE(review): the
# `pass` bodies are elided in this dump.
1105 def _real_initialize(self):
1106 """Real initialization process. Redefine in subclasses."""
1109 def _real_extract(self, url):
1110 """Real extraction process. Redefine in subclasses."""
# NOTE(review): most entries of the _video_extensions and
# _video_dimensions tables are elided in this dump.
1114 class YoutubeIE(InfoExtractor):
1115 """Information extractor for youtube.com."""
# Accepts watch/embed/short URLs and bare video IDs; group 1 is the URL
# prefix, group 2 the video id.
1117 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1118 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1119 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1120 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1121 _NETRC_MACHINE = 'youtube'
1122 # Listed in order of quality
1123 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1124 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1125 _video_extensions = {
1131 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1136 _video_dimensions = {
1151 IE_NAME = u'youtube'
1153 def report_lang(self):
1154 """Report attempt to set language."""
1155 self._downloader.to_screen(u'[youtube] Setting language')
1157 def report_login(self):
1158 """Report attempt to log in."""
1159 self._downloader.to_screen(u'[youtube] Logging in')
1161 def report_age_confirmation(self):
1162 """Report attempt to confirm age."""
1163 self._downloader.to_screen(u'[youtube] Confirming age')
1165 def report_video_webpage_download(self, video_id):
1166 """Report attempt to download video webpage."""
1167 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1169 def report_video_info_webpage_download(self, video_id):
1170 """Report attempt to download video info webpage."""
1171 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1173 def report_information_extraction(self, video_id):
1174 """Report attempt to extract video information."""
1175 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1181 def report_rtmp_download(self):
1182 """Indicate the download will use the RTMP protocol."""
1183 self._downloader.to_screen(u'[youtube] RTMP download detected')
	def _print_formats(self, formats):
		"""Print one line per itag with its extension and dimensions."""
		print 'Available formats:'
			# Unknown itags fall back to 'flv' / '???' placeholders.
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
	def _real_initialize(self):
		# One-time setup before extraction: gather credentials (CLI
		# options or ~/.netrc), force the site language to English,
		# log in and confirm age.  Failures are reported through the
		# downloader instead of being raised.
		if self._downloader is None:
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
		# Set language (keeps later regexes against English pages valid).
		request = urllib2.Request(self._LANG_URL)
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
		# No authentication to be performed
		if username is None:
		# Log in: plain form POST against the signup page.
				'current_form': 'loginForm',
				'action_login': 'Log In',
				'username': username,
				'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# The login form re-appearing in the response means failure.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
		# Confirm age (required for age-restricted videos).
				'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		# Extract video id from URL (group 2 of _VALID_URL).
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)
		# Download the watch page; has_verified=1 skips some interstitials.
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Un-escape the JSON-style backslashes in the SWF URL.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		# Fetch get_video_info, trying several 'el' variants until one
		# of them yields a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
		# Start extracting information
		self.report_information_extraction(video_id)
		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])
		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)
		# thumbnail (best effort only)
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
		# upload date: scraped from the watch page, normalized to YYYYMMDD
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
		# description: only extracted when the user actually asked for it
		video_description = u'No description available.'
		if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser
		# token
		video_token = urllib.unquote_plus(video_info['token'][0])
		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)
		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Comma-separated list of querystring-encoded stream entries.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				# Cap quality at the requested limit.
				format_list = available_formats[available_formats.index(format_limit):]
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
						video_url_list = [(rf, url_map[rf])]
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')
			# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1: video id (may be 'yt-<id>' for YouTube-hosted clips);
	# group 2: URL slug reused as the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Pages used to switch the family filter off before extraction.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'
1443 def __init__(self, youtube_ie, downloader=None):
1444 InfoExtractor.__init__(self, downloader)
1445 self._youtube_ie = youtube_ie
1447 def report_disclaimer(self):
1448 """Report disclaimer retrieval."""
1449 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1451 def report_age_confirmation(self):
1452 """Report attempt to confirm age."""
1453 self._downloader.to_screen(u'[metacafe] Confirming age')
1455 def report_download_webpage(self, video_id):
1456 """Report webpage download."""
1457 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1459 def report_extraction(self, video_id):
1460 """Report information extraction."""
1461 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
	def _real_initialize(self):
		# Visit the disclaimer page, then POST the filter form so that
		# family-filtered videos become reachable.
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(1)
		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate the whole job to the YouTube extractor.
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
		# At this point we have a new video
		self._downloader.increment_downloads()
		simple_title = mobj.group(2).decode('utf-8')
		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Direct mediaURL found in the page.
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]
			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
				video_url = mediaURL
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
			# Fall back to the flashvars blob.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
		# Title comes from the <title> tag.
		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1: video id, group 2: URL slug.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'
1582 def __init__(self, downloader=None):
1583 InfoExtractor.__init__(self, downloader)
1585 def report_download_webpage(self, video_id):
1586 """Report webpage download."""
1587 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1589 def report_extraction(self, video_id):
1590 """Report information extraction."""
1591 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
	def _real_extract(self, url):
		# Used to unescape HTML entities in the scraped title.
		htmlParser = HTMLParser.HTMLParser()
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)
		video_extension = 'flv'
		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so restricted videos are served.
		request.add_header('Cookie', 'family_filter=off')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
		# if needed add http://www.dailymotion.com/ if relative URL
		video_url = mediaURL
		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)
		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1: the docid query parameter (numeric video id).
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'
1672 def __init__(self, downloader=None):
1673 InfoExtractor.__init__(self, downloader)
1675 def report_download_webpage(self, video_id):
1676 """Report webpage download."""
1677 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1679 def report_extraction(self, video_id):
1680 """Report information extraction."""
1681 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)
		video_extension = 'mp4'
		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the mp4 download_url; fall back to the flv videoUrl.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))
		# The page embeds hex escapes for '=' and '&'; undo them.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')
		video_url = mediaURL
		mobj = re.search(r'<title>(.*)</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)
		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'
		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail only appears on the search results page.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Group 1: the 'current' query parameter (an .flv file name).
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'
1776 def __init__(self, downloader=None):
1777 InfoExtractor.__init__(self, downloader)
1779 def report_download_webpage(self, video_id):
1780 """Report webpage download."""
1781 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1783 def report_extraction(self, video_id):
1784 """Report information extraction."""
1785 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1787 def _real_extract(self, url):
1788 # Extract id from URL
1789 mobj = re.match(self._VALID_URL, url)
1791 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1794 # At this point we have a new video
1795 self._downloader.increment_downloads()
1796 video_id = mobj.group(1)
1798 video_extension = 'flv'
1800 # Retrieve video webpage to extract further information
1801 request = urllib2.Request(url)
1803 self.report_download_webpage(video_id)
1804 webpage = urllib2.urlopen(request).read()
1805 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1806 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1809 # Extract URL, uploader, and title from webpage
1810 self.report_extraction(video_id)
1811 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1813 self._downloader.trouble(u'ERROR: unable to extract media URL')
1815 mediaURL = urllib.unquote(mobj.group(1))
1817 video_url = mediaURL
1819 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1821 self._downloader.trouble(u'ERROR: unable to extract title')
1823 video_title = mobj.group(1).decode('utf-8')
1824 video_title = sanitize_title(video_title)
1825 simple_title = _simplify_title(vide_title)
1827 video_uploader = mobj.group(2).decode('utf-8')
1830 # Process video information
1831 self._downloader.process_info({
1832 'id': video_id.decode('utf-8'),
1833 'url': video_url.decode('utf-8'),
1834 'uploader': video_uploader,
1835 'upload_date': u'NA',
1836 'title': video_title,
1837 'stitle': simple_title,
1838 'ext': video_extension.decode('utf-8'),
1842 except UnavailableVideoError:
1843 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'
1855 def __init__(self, downloader=None):
1856 InfoExtractor.__init__(self, downloader)
1858 def report_download_webpage(self, video_id):
1859 """Report webpage download."""
1860 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1862 def report_extraction(self, video_id):
1863 """Report information extraction."""
1864 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1866 def _real_extract(self, url, new_video=True):
1867 # Extract ID from URL
1868 mobj = re.match(self._VALID_URL, url)
1870 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1873 # At this point we have a new video
1874 self._downloader.increment_downloads()
1875 video_id = mobj.group(2)
1876 video_extension = 'flv'
1878 # Rewrite valid but non-extractable URLs as
1879 # extractable English language /watch/ URLs
1880 if re.match(self._VPAGE_URL, url) is None:
1881 request = urllib2.Request(url)
1883 webpage = urllib2.urlopen(request).read()
1884 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1885 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1888 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1890 self._downloader.trouble(u'ERROR: Unable to extract id field')
1892 yahoo_id = mobj.group(1)
1894 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1896 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1898 yahoo_vid = mobj.group(1)
1900 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1901 return self._real_extract(url, new_video=False)
1903 # Retrieve video webpage to extract further information
1904 request = urllib2.Request(url)
1906 self.report_download_webpage(video_id)
1907 webpage = urllib2.urlopen(request).read()
1908 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1909 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1912 # Extract uploader and title from webpage
1913 self.report_extraction(video_id)
1914 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1916 self._downloader.trouble(u'ERROR: unable to extract video title')
1918 video_title = mobj.group(1).decode('utf-8')
1919 simple_title = _simplify_title(video_title)
1921 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1923 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1925 video_uploader = mobj.group(1).decode('utf-8')
1927 # Extract video thumbnail
1928 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1930 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1932 video_thumbnail = mobj.group(1).decode('utf-8')
1934 # Extract video description
1935 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1937 self._downloader.trouble(u'ERROR: unable to extract video description')
1939 video_description = mobj.group(1).decode('utf-8')
1940 if not video_description:
1941 video_description = 'No description available.'
1943 # Extract video height and width
1944 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1946 self._downloader.trouble(u'ERROR: unable to extract video height')
1948 yv_video_height = mobj.group(1)
1950 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1952 self._downloader.trouble(u'ERROR: unable to extract video width')
1954 yv_video_width = mobj.group(1)
1956 # Retrieve video playlist to extract media URL
1957 # I'm not completely sure what all these options are, but we
1958 # seem to need most of them, otherwise the server sends a 401.
1959 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1960 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1961 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1962 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1963 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1965 self.report_download_webpage(video_id)
1966 webpage = urllib2.urlopen(request).read()
1967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1968 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1971 # Extract media URL from playlist XML
1972 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1974 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1976 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1977 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1980 # Process video information
1981 self._downloader.process_info({
1982 'id': video_id.decode('utf-8'),
1984 'uploader': video_uploader,
1985 'upload_date': u'NA',
1986 'title': video_title,
1987 'stitle': simple_title,
1988 'ext': video_extension.decode('utf-8'),
1989 'thumbnail': video_thumbnail.decode('utf-8'),
1990 'description': video_description,
1991 'thumbnail': video_thumbnail,
1994 except UnavailableVideoError:
1995 self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	# NOTE(review): the '.' after (?:www|player) is unescaped, so it
	# matches any character rather than a literal dot.
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2005 def __init__(self, downloader=None):
2006 InfoExtractor.__init__(self, downloader)
2008 def report_download_webpage(self, video_id):
2009 """Report webpage download."""
2010 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2012 def report_extraction(self, video_id):
2013 """Report information extraction."""
2014 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
	def _real_extract(self, url, new_video=True):
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)
		# Retrieve video webpage to extract further information
		# (moogaloop/load returns an XML blob describing the clip)
		request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)
		# Extract title
		mobj = re.search(r'<caption>(.*?)</caption>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')
		simple_title = _simplify_title(video_title)
		# Extract uploader
		mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video uploader')
		video_uploader = mobj.group(1).decode('utf-8')
		# Extract video thumbnail
		mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')
		# # Extract video description
		# mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
		# 	self._downloader.trouble(u'ERROR: unable to extract video description')
		# video_description = mobj.group(1).decode('utf-8')
		# if not video_description: video_description = 'No description available.'
		# NOTE(review): placeholder left in while the block above is
		# disabled -- every Vimeo video gets the description 'Foo.'.
		video_description = 'Foo.'
		# Vimeo specific: extract request signature
		mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract request signature')
		sig = mobj.group(1).decode('utf-8')
		# Vimeo specific: extract video quality information
		mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video quality information')
		quality = mobj.group(1).decode('utf-8')
		if int(quality) == 1:
		# Vimeo specific: Extract request signature expiration
		mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
		sig_exp = mobj.group(1).decode('utf-8')
		video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
			# Process video information
			# NOTE(review): 'thumbnail' and 'description' each appear
			# twice in this dict; the later entries win.  The error
			# message below also lacks the leading '\n' every other
			# extractor uses -- both worth cleaning up.
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'uploader':	video_uploader,
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'thumbnail':	video_thumbnail.decode('utf-8'),
				'description':	video_description,
				'thumbnail':	video_thumbnail,
				'description':	video_description,
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor matched: downloads the page and
    guesses a media URL from common embed patterns (JW Player flashvars,
    file=/source= parameters).

    NOTE(review): this extract of the file is missing interleaved lines
    ('if mobj is None:', 'try:'/'return'); indentation below is a
    conventional reconstruction.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: this extractor only runs as a fallback.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Download *url* and scrape a direct media URL out of the raw HTML."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch[N|all]:terms' pseudo-URLs: scrapes the YouTube
    results pages and delegates each found video to the plain YouTube IE.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', loop headers); indentation below is a
    conventional reconstruction.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap on the number of results fetched for one query.
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested result count."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: just the first result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Avoid queueing duplicate video ids across result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Matched text is 'href="/watch?v=ID"'; split on '=' and drop
            # the trailing quote to recover ID.
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link: last results page reached.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch[N|all]:terms' pseudo-URLs; structurally parallel to
    YoutubeSearchIE but scrapes video.google.com result pages.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', loop headers); indentation below is a
    conventional reconstruction.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard cap on the number of results fetched for one query.
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this IE.
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested result count."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: just the first result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Avoid queueing duplicate video ids across result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No "Next" link: last results page reached.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch[N|all]:terms' pseudo-URLs; structurally parallel to
    YoutubeSearchIE but scrapes video.yahoo.com result pages.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', loop headers); indentation below is a
    conventional reconstruction.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on the number of results fetched for one query.
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this IE.
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested result count."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: just the first result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Avoid queueing duplicate video ids across result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No "Next" link: last results page reached.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the playlist pages, collects watch?v= ids, applies the
    playliststart/playlistend window, then delegates each id to the
    plain YouTube IE.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', loop headers); indentation below is a
    conventional reconstruction.
    """

    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids in the playlist and extract each one."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Group 3 means: a single video from a playlist URL — extract it directly.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No "Next" link: last playlist page reached.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply the user-requested playlist window (1-based start, inclusive end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed of a user (50 ids per request),
    applies the playliststart/playlistend window, then delegates each
    video id to the plain YouTube IE.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', loop headers); indentation below is a
    conventional reconstruction.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum ids per GData request (API limit at the time).
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect every upload of the user and extract the selected window."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply the user-requested window (1-based start, inclusive end;
        # -1 means "until the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button (POSTing
    gateway_result=1) and scrapes the real file URL and title out of the
    resulting page.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', dict closers); indentation below is a
    conventional reconstruction.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title for a depositfiles page."""
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Logs in (via --username/--password or .netrc), downloads the video
    page, and parses title/owner/thumbnail plus per-quality stream URLs
    out of JavaScript fragments in the page.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', the _video_extensions dict body, the
    login_form literal); indentation below is a conventional
    reconstruction.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers, ordered best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata: regex per field, applied to the raw page.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-quality stream URLs, from '("<fmt>_src", "...")' JS fragments.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form reappearing in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Download the video page, parse it, and queue the selected formats."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # thumbnail image: missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        # upload date
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension defaults to mp4 for unknown formats.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Requests the JSON skin of a blip.tv page; if the server answers with
    a video/* Content-Type instead, treats it as a direct download.

    NOTE(review): this extract of the file is missing interleaved lines
    (guards, 'try:'/'return', parts of the info dict literals);
    indentation below is a conventional reconstruction.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Regex to pull the filename extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv JSON metadata (or detect a direct media URL)."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask for machine-readable JSON instead of the HTML page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'stitle': _simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        if info is None: # Regular URL
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            # The payload may be wrapped in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv datestamps look like '12-31-11 11:59PM'.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'stitle': _simplify_title(data['title']),
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3031 class MyVideoIE(InfoExtractor):
3032 """Information Extractor for myvideo.de."""
3034 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3035 IE_NAME = u'myvideo'
3037 def __init__(self, downloader=None):
3038 InfoExtractor.__init__(self, downloader)
3040 def report_download_webpage(self, video_id):
3041 """Report webpage download."""
3042 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3044 def report_extraction(self, video_id):
3045 """Report information extraction."""
3046 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3048 def _real_extract(self,url):
3049 mobj = re.match(self._VALID_URL, url)
3051 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3054 video_id = mobj.group(1)
3057 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3059 self.report_download_webpage(video_id)
3060 webpage = urllib2.urlopen(request).read()
3061 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3062 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3065 self.report_extraction(video_id)
3066 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3069 self._downloader.trouble(u'ERROR: unable to extract media URL')
3071 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3073 mobj = re.search('<title>([^<]+)</title>', webpage)
3075 self._downloader.trouble(u'ERROR: unable to extract title')
3078 video_title = mobj.group(1)
3079 video_title = sanitize_title(video_title)
3081 simple_title = _simplify_title(video_title)
3084 self._downloader.process_info({
3088 'upload_date': u'NA',
3089 'title': video_title,
3090 'stitle': simple_title,
3095 except UnavailableVideoError:
3096 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ":tds"/":colbert"-style shortcuts as well as full-episodes
    # URLs on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        # Progress message: extraction started.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        # Progress message: per-item media configuration download.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        # Progress message: MRSS show index download.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        # Progress message: resolving the flash player URL.
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        # Resolve shortcut/redirect to a concrete episode URL, find the
        # flash <param> values, then download every item listed in the
        # episode's MRSS index feed.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('shortname'):
            # ":tds"-style shortcut: rewrite to the canonical show URL.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode in the URL: the site redirect below will
        # select the newest one.
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # Re-parse the post-redirect URL to learn the concrete episode.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        # First tuple element is the full player URL, second is the mgid URI.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like "...:<show>.com:<mediaId>"; split out the ids.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, src) pairs from every <rendition>.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()
            effTitle = showId + u'-' + epTitle
            # Fields of the info dict handed to process_info below:
                'upload_date': officialDate,
                'stitle': _simplify_title(effTitle),
                'description': officialTitle,
                'player_url': playerUrl
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: extraction started.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: player configuration download.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # Read the page's og:/description meta tags, then fetch the player
        # configuration (JS, coerced to JSON) to find the media URL.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        # Second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        # Fields of the info dict handed to process_info below:
            'uploader': showName,
            'upload_date': None,
            'stitle': _simplify_title(showName),
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Resolve the page's internal video id, then read the moogaloop
        # metadata XML for title/url/thumbnail.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        # Seed of the info dict; the rest is filled from the metadata XML.
            'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Extension (and format) derived from the media URL's suffix.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Media URL is percent-encoded in the page's flv_url parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the <title> tag minus the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        # Fields of the info dict handed to process_info below:
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'thumbnail': video_thumbnail,
            'description': None,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # Description: fall back to a fixed placeholder when absent.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Upload date: parse the page's pretty-date, normalise to YYYYMMDD.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': simple_title.decode('utf-8'),
            'stitle': simple_title.decode('utf-8'),
            'description': description.decode('utf-8')
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # Media path is base64-encoded in the page's jsclassref attribute;
        # decoded, it completes an rtmpe URL.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Title from the page's contentTitle JS variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # id/extension come from the last path component of the media URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
        # Fields of the info dict handed to process_info below:
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        # 'best' (or an unknown bitrate) means: take the highest available.
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate URL; network errors mean "try the next".
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        # List every available format/bitrate pair with its file extension.
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Probe formats until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: a specific VideoPage, a CoursePage, or the root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # Info dict seeded with a synthetic id; filled from the XML below.
                'id': _simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            # Course-level info dict:
                'id': _simplify_title(course),

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            # Course title from the page's <h1>; the id is the fallback.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a reference
            # entry, later re-dispatched through self.extract().
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
            # Root-page branch: enumerate all courses from the home page.
            unescapeHTML = HTMLParser.HTMLParser().unescape
                'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Every CoursePage link on the root becomes a reference entry.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
class PostProcessor(object):
    """Base class for postprocessing steps.

    Instances are registered on a downloader via its add_post_processor()
    method. After every successful download the downloader feeds the info
    dictionary (augmented with a 'filepath' key) through its chain of
    PostProcessors, passing each one the value returned by the previous.
    A None return value stops the chain; a PostProcessingError raised from
    run() is handled by the downloader. The downloader/postprocessor pair
    uses the same mutual-registration scheme as InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        The base implementation is the identity: it hands the information
        dictionary on to the next postprocessor unchanged. Subclasses may
        return a modified dictionary, or None to stop the chain.
        """
        return information  # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
    # Postprocessor: extract the audio track of a downloaded video with
    # ffmpeg/ffprobe, keeping the source codec when possible or
    # transcoding to the preferred one.

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        # 'best' means: keep the source audio codec whenever possible.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    def get_audio_codec(path):
        # NOTE(review): defined without 'self' -- presumably decorated as a
        # @staticmethod in the full file; confirm before editing.
        cmd = ['ffprobe', '-show_streams', '--', path]
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        if handle.wait() != 0:
        except (IOError, OSError):
        # Scan ffprobe output: remember the last codec_name seen and accept
        # it once a codec_type=audio line confirms it belongs to audio.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:

    def run_ffmpeg(path, out_path, codec, more_opts):
        # NOTE(review): defined without 'self' -- presumably a @staticmethod.
        cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
        ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
        except (IOError, OSError):

    def run(self, information):
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                # MP3 otherwise.
                acodec = 'libmp3lame'
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':

        # Output file: same prefix as the input, new audio extension.
        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)
        self._downloader.to_stderr(u'WARNING: error running ffmpeg')

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            os.utime(new_path, (time.time(), information['filetime']))
            self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        # Hand the new audio path down the postprocessing chain.
        information['filepath'] = new_path
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    urlh = urllib.urlopen(UPDATE_URL)
    newcontent = urlh.read()
    # Compare embedded version strings to avoid a pointless rewrite.
    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is not None and vmatch.group(1) == __version__:
        downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

    # Overwrite this very script file in binary mode.
    outf = open(filename, 'wb')
    outf.write(newcontent)
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename):
    # Read extra command-line options from a config file, shlex-splitting
    # each line (with '#' comments honoured); an unreadable file
    # contributes no options.
    optionf = open(filename)
    return [] # silently skip if file is not present
    res += shlex.split(l, comments=True)
4089 def _format_option_string(option):
4090 ''' ('-o', '--option') -> -o, --format METAVAR'''
4094 if option._short_opts: opts.append(option._short_opts[0])
4095 if option._long_opts: opts.append(option._long_opts[0])
4096 if len(opts) > 1: opts.insert(1, ', ')
4098 if option.takes_value(): opts.append(' %s' % option.metavar)
4100 return "".join(opts)
def _find_term_columns():
    # Terminal width: prefer the COLUMNS environment variable, otherwise
    # ask `stty size` (prints "rows cols") and take the second field.
    columns = os.environ.get('COLUMNS', None)
    sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out,err = sp.communicate()
    return int(out.split()[1])
# --- parseOpts() interior: help-formatter and OptionParser construction ---
# NOTE(review): the `def parseOpts()` header and the default `max_width`
# assignment are elided from this listing.
4116 max_help_position = 80
4118 # No need to wrap help messages if we're on a wide console
4119 columns = _find_term_columns()
4120 if columns: max_width = columns
4122 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Use the compact option renderer defined above instead of optparse's default.
4123 fmt.format_option_strings = _format_option_string
# Keyword arguments for OptionParser (dict literal opener elided at 4125).
4126 'version' : __version__,
4128 'usage' : '%prog [options] url [url...]',
# 'resolve' lets a later option definition silently override a conflicting one.
4129 'conflict_handler' : 'resolve',
4132 parser = optparse.OptionParser(**kw)
# Option groups: each topic gets its own section on the --help screen.
4135 general = optparse.OptionGroup(parser, 'General Options')
4136 selection = optparse.OptionGroup(parser, 'Video Selection')
4137 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4138 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4139 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4140 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4141 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options: help/version/update plus global download behavior.
4143 general.add_option('-h', '--help',
4144 action='help', help='print this help text and exit')
4145 general.add_option('-v', '--version',
4146 action='version', help='print program version and exit')
4147 general.add_option('-U', '--update',
4148 action='store_true', dest='update_self', help='update this program to latest version')
4149 general.add_option('-i', '--ignore-errors',
4150 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
# Rate limit is kept as a string here; parsed into bytes/s later by
# FileDownloader.parse_bytes during option validation.
4151 general.add_option('-r', '--rate-limit',
4152 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4153 general.add_option('-R', '--retries',
4154 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4155 general.add_option('--dump-user-agent',
4156 action='store_true', dest='dump_user_agent',
4157 help='display the current browser identification', default=False)
4158 general.add_option('--list-extractors',
4159 action='store_true', dest='list_extractors',
4160 help='List all supported extractors and the URLs they would handle', default=False)
# Video-selection options: playlist slicing, title filters, download cap.
4162 selection.add_option('--playlist-start',
4163 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
# -1 means "to the last entry"; validated against playliststart later.
4164 selection.add_option('--playlist-end',
4165 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4166 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4167 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4168 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
# Authentication options; --netrc conflicts with explicit credentials
# (enforced during validation below).
4170 authentication.add_option('-u', '--username',
4171 dest='username', metavar='USERNAME', help='account username')
4172 authentication.add_option('-p', '--password',
4173 dest='password', metavar='PASSWORD', help='account password')
4174 authentication.add_option('-n', '--netrc',
4175 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video-format options: -f and --all-formats share dest='format'
# (--all-formats stores the sentinel string 'all').
4178 video_format.add_option('-f', '--format',
4179 action='store', dest='format', metavar='FORMAT', help='video format code')
4180 video_format.add_option('--all-formats',
4181 action='store_const', dest='format', help='download all available video formats', const='all')
4182 video_format.add_option('--prefer-free-formats',
4183 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4184 video_format.add_option('--max-quality',
4185 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4186 video_format.add_option('-F', '--list-formats',
4187 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
# Verbosity / simulation options. The --get-* flags each imply quiet
# simulation and print a single piece of metadata (combined into the
# 'quiet' and 'skip_download' settings when building the FileDownloader).
4190 verbosity.add_option('-q', '--quiet',
4191 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4192 verbosity.add_option('-s', '--simulate',
4193 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4194 verbosity.add_option('--skip-download',
4195 action='store_true', dest='skip_download', help='do not download the video', default=False)
4196 verbosity.add_option('-g', '--get-url',
4197 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4198 verbosity.add_option('-e', '--get-title',
4199 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4200 verbosity.add_option('--get-thumbnail',
4201 action='store_true', dest='getthumbnail',
4202 help='simulate, quiet but print thumbnail URL', default=False)
4203 verbosity.add_option('--get-description',
4204 action='store_true', dest='getdescription',
4205 help='simulate, quiet but print video description', default=False)
4206 verbosity.add_option('--get-filename',
4207 action='store_true', dest='getfilename',
4208 help='simulate, quiet but print output filename', default=False)
4209 verbosity.add_option('--get-format',
4210 action='store_true', dest='getformat',
4211 help='simulate, quiet but print output format', default=False)
4212 verbosity.add_option('--no-progress',
4213 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4214 verbosity.add_option('--console-title',
4215 action='store_true', dest='consoletitle',
4216 help='display progress in console titlebar', default=False)
# Filesystem options: file naming, overwrite/resume behavior, cookies,
# .part files, mtime, and metadata sidecar files.
4219 filesystem.add_option('-t', '--title',
4220 action='store_true', dest='usetitle', help='use title in file name', default=False)
4221 filesystem.add_option('-l', '--literal',
4222 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4223 filesystem.add_option('-A', '--auto-number',
4224 action='store_true', dest='autonumber',
4225 help='number downloaded files starting from 00000', default=False)
4226 filesystem.add_option('-o', '--output',
4227 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4228 filesystem.add_option('-a', '--batch-file',
4229 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4230 filesystem.add_option('-w', '--no-overwrites',
4231 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
# --continue / --no-continue share dest='continue_dl' (store_true vs store_false).
4232 filesystem.add_option('-c', '--continue',
4233 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4234 filesystem.add_option('--no-continue',
4235 action='store_false', dest='continue_dl',
4236 help='do not resume partially downloaded files (restart from beginning)')
4237 filesystem.add_option('--cookies',
4238 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4239 filesystem.add_option('--no-part',
4240 action='store_true', dest='nopart', help='do not use .part files', default=False)
# updatetime defaults to True; --no-mtime turns it off.
4241 filesystem.add_option('--no-mtime',
4242 action='store_false', dest='updatetime',
4243 help='do not use the Last-modified header to set the file modification time', default=True)
4244 filesystem.add_option('--write-description',
4245 action='store_true', dest='writedescription',
4246 help='write video description to a .description file', default=False)
4247 filesystem.add_option('--write-info-json',
4248 action='store_true', dest='writeinfojson',
4249 help='write video metadata to a .info.json file', default=False)
# Post-processing options (audio extraction via ffmpeg/ffprobe).
4252 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4253 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
# audioformat is validated later against {'best','aac','mp3','vorbis','m4a'}.
4254 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4255 help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4256 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4257 help='ffmpeg audio bitrate specification, 128k by default')
4258 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4259 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Attach all groups to the parser (this order defines the --help layout).
4262 parser.add_option_group(general)
4263 parser.add_option_group(selection)
4264 parser.add_option_group(filesystem)
4265 parser.add_option_group(verbosity)
4266 parser.add_option_group(video_format)
4267 parser.add_option_group(authentication)
4268 parser.add_option_group(postproc)
# Locate the per-user config file: $XDG_CONFIG_HOME/youtube-dl.conf when the
# variable is set, otherwise ~/.config/youtube-dl.conf.
# NOTE(review): the if/else around these two assignments is elided
# (numbering jumps 4270→4272→4274).
4270 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4272 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4274 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
# Precedence: system config, then user config, then the actual command line
# (later arguments override earlier ones under the 'resolve' handler).
4275 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4276 opts, args = parser.parse_args(argv)
4278 return parser, opts, args
# Build the ordered list of info-extractor instances. Order matters: the
# first extractor whose suitable() accepts a URL handles it, so the more
# specific YouTube playlist/user/search extractors precede the generic ones.
# NOTE(review): this listing is heavily elided — the opening of the returned
# list (around 4287) and many extractor entries (4291-4316) are not visible.
4280 def gen_extractors():
4281 """ Return a list of an instance of every supported extractor.
4282 The order does matter; the first extractor matched is the one handling the URL.
# Shared base extractors, reused by the playlist/user/search wrappers below.
4284 youtube_ie = YoutubeIE()
4285 google_ie = GoogleIE()
4286 yahoo_ie = YahooIE()
4288 YoutubePlaylistIE(youtube_ie),
4289 YoutubeUserIE(youtube_ie),
4290 YoutubeSearchIE(youtube_ie),
4292 MetacafeIE(youtube_ie),
4295 GoogleSearchIE(google_ie),
4298 YahooSearchIE(yahoo_ie),
4311 StanfordOpenClassroomIE(),
# --- main program interior (its `def` header is elided from this listing) ---
# Parse options, then pick a cookie jar: in-memory when no --cookies file was
# given, otherwise a MozillaCookieJar backed by that file.
4317 parser, opts, args = parseOpts()
4319 # Open appropriate CookieJar
4320 if opts.cookiefile is None:
4321 jar = cookielib.CookieJar()
# NOTE(review): the try:/else branch structure and the jar.load() call are
# elided here (numbering jumps 4321→4324 and 4325→4327).
4324 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load an existing, readable file; a fresh file is created on save.
4325 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4327 except (IOError, OSError), err:
4328 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string and (presumably — the sys.exit()
# after the print is elided at 4333) stop.
4331 if opts.dump_user_agent:
4332 print std_headers['User-Agent']
4335 # Batch file verification
4337 if opts.batchfile is not None:
# '-' reads URLs from stdin; the try: and the stdin branch body are elided
# here (numbering jumps 4339→4342 and 4345→4347).
4339 if opts.batchfile == '-':
4342 batchfd = open(opts.batchfile, 'r')
4343 batchurls = batchfd.readlines()
4344 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with a comment marker (#, /, or ;).
4345 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4347 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come before the ones given on the command line.
4348 all_urls = batchurls + args
4350 # General configuration
# Install a global urllib2 opener with proxy support, the cookie jar chosen
# above, and the project's custom handler.
4351 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4352 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4353 urllib2.install_opener(opener)
4354 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4356 extractors = gen_extractors()
# --list-extractors: print each extractor and which of the given URLs it
# would claim. Each URL is attributed to the first matching extractor only,
# mirroring the dispatch order of gen_extractors().
# NOTE(review): the print statements inside this loop are elided
# (numbering jumps 4359→4361 and ends at 4363).
4358 if opts.list_extractors:
4359 for ie in extractors:
4361 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4362 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4363 for mu in matchedUrls:
4367 # Conflicting, missing and erroneous options
# Validate/normalize parsed options; parser.error() prints usage and exits.
4368 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4369 parser.error(u'using .netrc conflicts with giving username/password')
4370 if opts.password is not None and opts.username is None:
4371 parser.error(u'account username missing')
4372 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4373 parser.error(u'using output template conflicts with using title, literal title or auto number')
4374 if opts.usetitle and opts.useliteral:
4375 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (never echoes).
4376 if opts.username is not None and opts.password is None:
4377 opts.password = getpass.getpass(u'Type account password and press return:')
# Convert the human-readable rate limit (e.g. "50k") into bytes per second.
4378 if opts.ratelimit is not None:
4379 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4380 if numeric_limit is None:
4381 parser.error(u'invalid rate limit specified')
4382 opts.ratelimit = numeric_limit
# NOTE(review): the try: lines preceding these conversions are elided
# (numbering jumps 4383→4385, 4387→4389, 4393→4395).
4383 if opts.retries is not None:
4385 opts.retries = long(opts.retries)
4386 except (TypeError, ValueError), err:
4387 parser.error(u'invalid retry count specified')
4389 opts.playliststart = int(opts.playliststart)
4390 if opts.playliststart <= 0:
4391 raise ValueError(u'Playlist start must be positive')
4392 except (TypeError, ValueError), err:
4393 parser.error(u'invalid playlist start number specified')
4395 opts.playlistend = int(opts.playlistend)
# -1 is the "until the end" sentinel; any other value must form a valid range.
4396 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4397 raise ValueError(u'Playlist end must be greater than playlist start')
4398 except (TypeError, ValueError), err:
4399 parser.error(u'invalid playlist end number specified')
4400 if opts.extractaudio:
4401 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4402 parser.error(u'invalid audio format specified')
# Build the FileDownloader from the validated options. Any --get-* flag
# implies quiet mode, and any simulation/--get-* flag implies skipping the
# actual download.
4405 fd = FileDownloader({
4406 'usenetrc': opts.usenetrc,
4407 'username': opts.username,
4408 'password': opts.password,
4409 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4410 'forceurl': opts.geturl,
4411 'forcetitle': opts.gettitle,
4412 'forcethumbnail': opts.getthumbnail,
4413 'forcedescription': opts.getdescription,
4414 'forcefilename': opts.getfilename,
4415 'forceformat': opts.getformat,
4416 'simulate': opts.simulate,
4417 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4418 'format': opts.format,
4419 'format_limit': opts.format_limit,
4420 'listformats': opts.listformats,
# Output template selection: an explicit -o template wins; otherwise the
# first matching flag combination in this or-chain supplies a default.
# The chain relies on `and`/`or` short-circuiting (Python 2 idiom).
4421 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4422 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4423 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4424 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4425 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4426 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4427 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4428 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4429 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4430 or u'%(id)s.%(ext)s'),
4431 'ignoreerrors': opts.ignoreerrors,
4432 'ratelimit': opts.ratelimit,
4433 'nooverwrites': opts.nooverwrites,
4434 'retries': opts.retries,
4435 'continuedl': opts.continue_dl,
4436 'noprogress': opts.noprogress,
4437 'playliststart': opts.playliststart,
4438 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') forces log output onto stderr.
4439 'logtostderr': opts.outtmpl == '-',
4440 'consoletitle': opts.consoletitle,
4441 'nopart': opts.nopart,
4442 'updatetime': opts.updatetime,
4443 'writedescription': opts.writedescription,
4444 'writeinfojson': opts.writeinfojson,
4445 'matchtitle': opts.matchtitle,
4446 'rejecttitle': opts.rejecttitle,
4447 'max_downloads': opts.max_downloads,
4448 'prefer_free_formats': opts.prefer_free_formats,
# Register extractors in dispatch order, then optional post-processor.
4450 for extractor in extractors:
4451 fd.add_info_extractor(extractor)
4454 if opts.extractaudio:
4455 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# --update: overwrite the running script (sys.argv[0]) with the latest release.
4458 if opts.update_self:
4459 updateSelf(fd, sys.argv[0])
# No URLs is an error — unless we only came here to self-update.
4462 if len(all_urls) < 1:
4463 if not opts.update_self:
4464 parser.error(u'you must provide at least one URL')
# NOTE(review): the try: preceding the download call, and the success-path
# sys.exit, are elided here (numbering jumps 4464→4469 and 4471→4474).
4469 retcode = fd.download(all_urls)
4470 except MaxDownloadsReached:
4471 fd.to_screen(u'--max-download limit reached, aborting.')
4474 # Dump cookie jar if requested
# NOTE(review): the try:/jar.save() lines are elided (4476-4477 missing).
4475 if opts.cookiefile is not None:
4478 except (IOError, OSError), err:
4479 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error mapping for the (elided) outer try around main's body.
4486 except DownloadError:
4488 except SameFileError:
4489 sys.exit(u'ERROR: fixed output name but more than one file to download')
4490 except KeyboardInterrupt:
4491 sys.exit(u'\nERROR: Interrupted by user')
# Script entry point. NOTE(review): the guarded body (lines 4494-4495) is
# elided from this listing.
4493 if __name__ == '__main__':
4496 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: