2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Record the announced size and the number of bytes actually received."""
    # Both values are kept so the caller can format a precise error message.
    self.expected = expected
    self.downloaded = downloaded
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Write *message* to stderr in the system's preferred encoding."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description is being written to *descfn*."""
    message = u'[info] Writing video description to: %s' % descfn
    self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Announce that the JSON metadata file *infofn* is being written."""
    message = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Announce the path the download will be saved to."""
    message = u'[download] Destination: %s' % filename
    self.to_screen(message, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce that the download resumes at byte offset *resume_len*."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
def report_retry(self, count, retries):
    """Announce retry attempt *count* of *retries* after an HTTP 5xx error."""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that a partial download could not be resumed."""
    notice = u'[download] Unable to resume'
    self.to_screen(notice)
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the ordinal used to number each downloaded file."""
    # Consumed by prepare_filename() through the 'autonumber' template key.
    self._num_downloads = self._num_downloads + 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forceformat', False):
714 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
716 # Do nothing else if in simulate mode
717 if self.params.get('simulate', False):
723 matchtitle=self.params.get('matchtitle',False)
724 rejecttitle=self.params.get('rejecttitle',False)
725 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
729 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
733 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734 self.to_stderr(u'WARNING: file exists and will be skipped')
738 dn = os.path.dirname(filename)
739 if dn != '' and not os.path.exists(dn):
741 except (OSError, IOError), err:
742 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
745 if self.params.get('writedescription', False):
747 descfn = filename + '.description'
748 self.report_writedescription(descfn)
749 descfile = open(descfn, 'wb')
751 descfile.write(info_dict['description'].encode('utf-8'))
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write description file ' + descfn)
758 if self.params.get('writeinfojson', False):
759 infofn = filename + '.info.json'
760 self.report_writeinfojson(infofn)
763 except (NameError,AttributeError):
764 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
767 infof = open(infofn, 'wb')
769 json.dump(info_dict, infof)
772 except (OSError, IOError):
773 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
776 if not self.params.get('skip_download', False):
778 success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
779 info_dict.update(add_data)
780 except (OSError, IOError), err:
781 raise UnavailableVideoError
782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
783 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
785 except (ContentTooShortError, ), err:
786 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
791 self.post_process(filename, info_dict)
792 except (PostProcessingError), err:
793 self.trouble(u'ERROR: postprocessing: %s' % str(err))
796 def download(self, url_list):
797 """Download a given list of URLs."""
798 if len(url_list) > 1 and self.fixed_template():
799 raise SameFileError(self.params['outtmpl'])
802 suitable_found = False
804 # Go to next InfoExtractor if not suitable
805 if not ie.suitable(url):
808 # Suitable InfoExtractor found
809 suitable_found = True
811 # Extract information from URL and process it
814 # Suitable InfoExtractor had been found; go to next URL
817 if not suitable_found:
818 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
820 return self._download_retcode
822 def post_process(self, filename, ie_info):
823 """Run the postprocessing chain on the given file."""
825 info['filepath'] = filename
831 def _download_with_rtmpdump(self, filename, url, player_url):
832 self.report_destination(filename)
833 tmpfilename = self.temp_name(filename)
835 # Check for rtmpdump first
837 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
838 except (OSError, IOError):
839 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
842 # Download using rtmpdump. rtmpdump returns exit code 2 when
843 # the connection was interrumpted and resuming appears to be
844 # possible. This is part of rtmpdump's normal usage, AFAIK.
845 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
846 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
847 while retval == 2 or retval == 1:
848 prevsize = os.path.getsize(tmpfilename)
849 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
850 time.sleep(5.0) # This seems to be needed
851 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
852 cursize = os.path.getsize(tmpfilename)
853 if prevsize == cursize and retval == 1:
855 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
856 if prevsize == cursize and retval == 2 and cursize > 1024:
857 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
861 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
862 self.try_rename(tmpfilename, filename)
865 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
868 def _do_download(self, filename, url, player_url):
869 # Check file already present
870 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
871 self.report_file_already_downloaded(filename)
874 # Attempt to download using rtmpdump
875 if url.startswith('rtmp'):
876 return self._download_with_rtmpdump(filename, url, player_url)
878 tmpfilename = self.temp_name(filename)
882 # Do not include the Accept-Encoding header
883 headers = {'Youtubedl-no-compression': 'True'}
884 basic_request = urllib2.Request(url, None, headers)
885 request = urllib2.Request(url, None, headers)
887 # Establish possible resume length
888 if os.path.isfile(tmpfilename):
889 resume_len = os.path.getsize(tmpfilename)
893 # Request parameters in case of being able to resume
894 if self.params.get('continuedl', False) and resume_len != 0:
895 self.report_resuming_byte(resume_len)
896 request.add_header('Range', 'bytes=%d-' % resume_len)
900 retries = self.params.get('retries', 0)
901 while count <= retries:
902 # Establish connection
904 data = urllib2.urlopen(request)
906 except (urllib2.HTTPError, ), err:
907 if (err.code < 500 or err.code >= 600) and err.code != 416:
908 # Unexpected HTTP error
910 elif err.code == 416:
911 # Unable to resume (requested range not satisfiable)
913 # Open the connection again without the range header
914 data = urllib2.urlopen(basic_request)
915 content_length = data.info()['Content-Length']
916 except (urllib2.HTTPError, ), err:
917 if err.code < 500 or err.code >= 600:
920 # Examine the reported length
921 if (content_length is not None and
922 (resume_len - 100 < long(content_length) < resume_len + 100)):
923 # The file had already been fully downloaded.
924 # Explanation to the above condition: in issue #175 it was revealed that
925 # YouTube sometimes adds or removes a few bytes from the end of the file,
926 # changing the file size slightly and causing problems for some users. So
927 # I decided to implement a suggested change and consider the file
928 # completely downloaded if the file size differs less than 100 bytes from
929 # the one in the hard drive.
930 self.report_file_already_downloaded(filename)
931 self.try_rename(tmpfilename, filename)
934 # The length does not match, we start the download over
935 self.report_unable_to_resume()
941 self.report_retry(count, retries)
944 self.trouble(u'ERROR: giving up after %s retries' % retries)
947 data_len = data.info().get('Content-length', None)
948 if data_len is not None:
949 data_len = long(data_len) + resume_len
950 data_len_str = self.format_bytes(data_len)
951 byte_counter = 0 + resume_len
957 data_block = data.read(block_size)
959 if len(data_block) == 0:
961 byte_counter += len(data_block)
963 # Open file just in time
966 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
967 assert stream is not None
968 filename = self.undo_temp_name(tmpfilename)
969 self.report_destination(filename)
970 except (OSError, IOError), err:
971 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
974 stream.write(data_block)
975 except (IOError, OSError), err:
976 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
978 block_size = self.best_block_size(after - before, len(data_block))
981 percent_str = self.calc_percent(byte_counter, data_len)
982 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
983 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
984 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
987 self.slow_down(start, byte_counter - resume_len)
990 self.trouble(u'\nERROR: Did not get any data blocks')
994 if data_len is not None and byte_counter != data_len:
995 raise ContentTooShortError(byte_counter, long(data_len))
996 self.try_rename(tmpfilename, filename)
998 # Update file modification time
1000 if self.params.get('updatetime', True):
1001 filetime = self.try_utime(filename, data.info().get('last-modified', None))
1003 return True, {'filetime': filetime}
1006 class InfoExtractor(object):
1007 """Information Extractor class.
1009 Information extractors are the classes that, given a URL, extract
1010 information from the video (or videos) the URL refers to. This
1011 information includes the real video URL, the video title and simplified
1012 title, author and others. The information is stored in a dictionary
1013 which is then passed to the FileDownloader. The FileDownloader
1014 processes this information possibly downloading the video to the file
1015 system, among other possible outcomes. The dictionaries must include
1016 the following fields:
1018 id: Video identifier.
1019 url: Final video URL.
1020 uploader: Nickname of the video uploader.
1021 title: Literal title.
1022 stitle: Simplified title.
1023 ext: Video filename extension.
1024 format: Video format.
1025 player_url: SWF Player URL (may be None).
1027 The following fields are optional. Their primary purpose is to allow
1028 youtube-dl to serve as the backend for a video search function, such
1029 as the one in youtube2mp3. They are only used when their respective
1030 forced printing functions are called:
1032 thumbnail: Full URL to a video thumbnail image.
1033 description: One-line video description.
1035 Subclasses of this one should re-define the _real_initialize() and
1036 _real_extract() methods and define a _VALID_URL regexp.
1037 Probably, they should also be added to the list of extractors.
1043 def __init__(self, downloader=None):
1044 """Constructor. Receives an optional downloader."""
1046 self.set_downloader(downloader)
def suitable(self, url):
    """Return True if *url* matches this extractor's _VALID_URL pattern."""
    return bool(re.match(self._VALID_URL, url))
1052 def initialize(self):
1053 """Initializes an instance (authentication, etc)."""
1055 self._real_initialize()
1058 def extract(self, url):
1059 """Extracts URL information and returns it in list of dicts."""
1061 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Stored for the report_* helpers (which call to_screen on it) and
    # checked against None in _real_initialize(); may legitimately be
    # None until a FileDownloader registers this extractor.
    self._downloader = downloader
1067 def _real_initialize(self):
1068 """Real initialization process. Redefine in subclasses."""
1071 def _real_extract(self, url):
1072 """Real extraction process. Redefine in subclasses."""
1076 class YoutubeIE(InfoExtractor):
1077 """Information extractor for youtube.com."""
1079 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1080 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1081 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1082 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1083 _NETRC_MACHINE = 'youtube'
1084 # Listed in order of quality
1085 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1086 _video_extensions = {
1092 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1096 IE_NAME = u'youtube'
def report_lang(self):
    """Announce the attempt to switch YouTube to the English interface."""
    note = u'[youtube] Setting language'
    self._downloader.to_screen(note)
def report_login(self):
    """Announce the attempt to log in to YouTube."""
    note = u'[youtube] Logging in'
    self._downloader.to_screen(note)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    note = u'[youtube] Confirming age'
    self._downloader.to_screen(note)
def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    note = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(note)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the video-info page for *video_id*."""
    note = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(note)
1118 def report_information_extraction(self, video_id):
1119 """Report attempt to extract video information."""
1120 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1122 def report_unavailable_format(self, video_id, format):
1123 """Report extracted video URL."""
1124 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1126 def report_rtmp_download(self):
1127 """Indicate the download will use the RTMP protocol."""
1128 self._downloader.to_screen(u'[youtube] RTMP download detected')
1130 def _real_initialize(self):
1131 if self._downloader is None:
1136 downloader_params = self._downloader.params
1138 # Attempt to use provided username and password or .netrc data
1139 if downloader_params.get('username', None) is not None:
1140 username = downloader_params['username']
1141 password = downloader_params['password']
1142 elif downloader_params.get('usenetrc', False):
1144 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1145 if info is not None:
1149 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1150 except (IOError, netrc.NetrcParseError), err:
1151 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1155 request = urllib2.Request(self._LANG_URL)
1158 urllib2.urlopen(request).read()
1159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1163 # No authentication to be performed
1164 if username is None:
1169 'current_form': 'loginForm',
1171 'action_login': 'Log In',
1172 'username': username,
1173 'password': password,
1175 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1178 login_results = urllib2.urlopen(request).read()
1179 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1180 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1182 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1183 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1189 'action_confirm': 'Confirm',
1191 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1193 self.report_age_confirmation()
1194 age_results = urllib2.urlopen(request).read()
1195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1196 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1199 def _real_extract(self, url):
1200 # Extract video id from URL
1201 mobj = re.match(self._VALID_URL, url)
1203 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1205 video_id = mobj.group(2)
1208 self.report_video_webpage_download(video_id)
1209 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1211 video_webpage = urllib2.urlopen(request).read()
1212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1216 # Attempt to extract SWF player URL
1217 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1218 if mobj is not None:
1219 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1224 self.report_video_info_webpage_download(video_id)
1225 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1226 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1227 % (video_id, el_type))
1228 request = urllib2.Request(video_info_url)
1230 video_info_webpage = urllib2.urlopen(request).read()
1231 video_info = parse_qs(video_info_webpage)
1232 if 'token' in video_info:
1234 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1235 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1237 if 'token' not in video_info:
1238 if 'reason' in video_info:
1239 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1241 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1244 # Start extracting information
1245 self.report_information_extraction(video_id)
1248 if 'author' not in video_info:
1249 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1251 video_uploader = urllib.unquote_plus(video_info['author'][0])
1254 if 'title' not in video_info:
1255 self._downloader.trouble(u'ERROR: unable to extract video title')
1257 video_title = urllib.unquote_plus(video_info['title'][0])
1258 video_title = video_title.decode('utf-8')
1259 video_title = sanitize_title(video_title)
1262 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1263 simple_title = simple_title.strip(ur'_')
1266 if 'thumbnail_url' not in video_info:
1267 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1268 video_thumbnail = ''
1269 else: # don't panic if we can't find it
1270 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1274 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1275 if mobj is not None:
1276 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1277 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1278 for expression in format_expressions:
1280 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1288 video_description = u'No description available.'
1289 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1290 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1291 if mobj is not None:
1292 video_description = mobj.group(1).decode('utf-8')
1294 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1295 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1296 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1297 # TODO use another parser
1300 video_token = urllib.unquote_plus(video_info['token'][0])
1302 # Decide which formats to download
1303 req_format = self._downloader.params.get('format', None)
1305 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1306 self.report_rtmp_download()
1307 video_url_list = [(None, video_info['conn'][0])]
1308 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1309 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1310 url_data = [parse_qs(uds) for uds in url_data_strs]
1311 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1312 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1314 format_limit = self._downloader.params.get('format_limit', None)
1315 if format_limit is not None and format_limit in self._available_formats:
1316 format_list = self._available_formats[self._available_formats.index(format_limit):]
1318 format_list = self._available_formats
1319 existing_formats = [x for x in format_list if x in url_map]
1320 if len(existing_formats) == 0:
1321 self._downloader.trouble(u'ERROR: no known formats available for video')
1323 if req_format is None:
1324 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1325 elif req_format == 'worst':
1326 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1327 elif req_format == '-1':
1328 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1331 if req_format not in url_map:
1332 self._downloader.trouble(u'ERROR: requested format not available')
1334 video_url_list = [(req_format, url_map[req_format])] # Specific format
1336 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1339 for format_param, video_real_url in video_url_list:
1340 # At this point we have a new video
1341 self._downloader.increment_downloads()
1344 video_extension = self._video_extensions.get(format_param, 'flv')
1347 # Process video information
1348 self._downloader.process_info({
1349 'id': video_id.decode('utf-8'),
1350 'url': video_real_url.decode('utf-8'),
1351 'uploader': video_uploader.decode('utf-8'),
1352 'upload_date': upload_date,
1353 'title': video_title,
1354 'stitle': simple_title,
1355 'ext': video_extension.decode('utf-8'),
1356 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1357 'thumbnail': video_thumbnail.decode('utf-8'),
1358 'description': video_description,
1359 'player_url': player_url,
1361 except UnavailableVideoError, err:
1362 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided excerpt -- `try:` headers, `if mobj is None:` guards,
# `return` statements and dict delimiters are missing between visible lines.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints: visiting/POSTing these disables content filtering.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        # Keeps a YoutubeIE around to delegate 'yt-*' ids to (see _real_extract).
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST past the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (form dict partly elided in this excerpt).
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URL/title/uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor for mirrored videos.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL directly in the markup.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                video_url = mediaURL
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # New-style page: media URL inside the flashvars JSON blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided excerpt -- guard/`try:`/`return` lines are missing
# between visible statements.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the video id (before '_'), group 2 the slug title.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Body elided in this excerpt (no initialization visible).

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Cookie disables Dailymotion's family filter for this request.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        # 'sdURL' within the decoded sequence blob is the SD stream URL.
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided excerpt -- guard/`try:`/`return` lines are missing
# between visible statements.
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Body elided in this excerpt (no initialization visible).

    def _real_extract(self, url):
        """Extract media URL, title, description and thumbnail from a docid page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the mp4 download_url; fall back to the escaped flv videoUrl.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the JS hex-escapes for '=' and '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Thumbnail only lives on the search results page, so search for it.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info
            video_thumbnail = ''

            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided excerpt -- guard/`try:`/`return` lines are missing
# between visible statements.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Video id is the .flv filename in the 'current' query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Body elided in this excerpt (no initialization visible).

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title regex also captures the uploader as group 2 (used below).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader,
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided excerpt -- guard/`try:`/`return` lines are missing
# between visible statements.
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Body elided in this excerpt (no initialization visible).

    def _real_extract(self, url, new_video=True):
        """Extract metadata, then fetch the playlist XML for the media URL.

        new_video is False on the recursive call made after rewriting a
        non-/watch/ URL into the canonical /watch/ form.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures the 'people|profile' alternation;
        # the uploader name appears to be group(2) -- looks like a bug, confirm.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

            # Process video information
            # NOTE(review): 'thumbnail' appears twice below; the later
            # (non-decoded) entry wins in a dict literal -- confirm intent.
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'uploader':	video_uploader,
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
                'thumbnail':	video_thumbnail.decode('utf-8'),
                'description':	video_description,
                'thumbnail':	video_thumbnail,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided excerpt -- guard/`try:`/`return` lines are missing
# between visible statements.
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Body elided in this excerpt (no initialization visible).

    def _real_extract(self, url, new_video=True):
        """Extract metadata from moogaloop XML, build a signed play URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        # (moogaloop/load returns XML metadata for the clip).
        request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        mobj = re.search(r'<caption>(.*?)</caption>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # # Extract video description
        # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
        # 	self._downloader.trouble(u'ERROR: unable to extract video description')
        # video_description = mobj.group(1).decode('utf-8')
        # if not video_description: video_description = 'No description available.'
        # NOTE(review): placeholder description left in while the real
        # extraction above is commented out -- confirm before release.
        video_description = 'Foo.'

        # Vimeo specific: extract request signature
        mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature')
        sig = mobj.group(1).decode('utf-8')

        # Vimeo specific: Extract request signature expiration
        mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
        sig_exp = mobj.group(1).decode('utf-8')

        video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)

            # Process video information
            # NOTE(review): 'thumbnail' and 'description' each appear twice
            # below; the later entries win in a dict literal -- confirm intent.
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'uploader':	video_uploader,
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'thumbnail':	video_thumbnail.decode('utf-8'),
                'description':	video_description,
                'thumbnail':	video_thumbnail,
                'description':	video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
# Generic last-resort extractor: fetch the page and scan it for a direct
# media URL (JW Player SWFObject "file=" flashvars, then a looser
# file=/source= parameter), deriving id/title/uploader heuristically.
# NOTE(review): this listing is elided — guard lines such as
# `if mobj is None:` / `return` / `try:` between adjacent statements are
# missing from this view; read the control flow with that in mind.
2052 class GenericIE(InfoExtractor):
2053 """Generic last-resort information extractor."""
2056 IE_NAME = u'generic'
2058 def __init__(self, downloader=None):
2059 InfoExtractor.__init__(self, downloader)
2061 def report_download_webpage(self, video_id):
2062 """Report webpage download."""
# Warn first: reaching this extractor means no site-specific IE matched.
2063 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2064 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2066 def report_extraction(self, video_id):
2067 """Report information extraction."""
2068 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2070 def _real_initialize(self):
2073 def _real_extract(self, url):
2074 # At this point we have a new video
2075 self._downloader.increment_downloads()
# Provisional id from the URL tail; replaced below once the real media
# URL is known.
2077 video_id = url.split('/')[-1]
2078 request = urllib2.Request(url)
2080 self.report_download_webpage(video_id)
2081 webpage = urllib2.urlopen(request).read()
2082 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2083 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2085 except ValueError, err:
2086 # since this is the last-resort InfoExtractor, if
2087 # this error is thrown, it'll be thrown here
2088 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2091 self.report_extraction(video_id)
2092 # Start with something easy: JW Player in SWFObject
2093 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2095 # Broaden the search a little bit
2096 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2098 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2101 # It's possible that one of the regexes
2102 # matched, but returned an empty group:
2103 if mobj.group(1) is None:
2104 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2107 video_url = urllib.unquote(mobj.group(1))
2108 video_id = os.path.basename(video_url)
2110 # here's a fun little line of code for you:
# Split extension and basename from the media filename.
2111 video_extension = os.path.splitext(video_id)[1][1:]
2112 video_id = os.path.splitext(video_id)[0]
2114 # it's tempting to parse this further, but you would
2115 # have to take into account all the variations like
2116 # Video Title - Site Name
2117 # Site Name | Video Title
2118 # Video Title - Tagline | Site Name
2119 # and so on and so forth; it's just not practical
2120 mobj = re.search(r'<title>(.*)</title>', webpage)
2122 self._downloader.trouble(u'ERROR: unable to extract title')
2124 video_title = mobj.group(1).decode('utf-8')
2125 video_title = sanitize_title(video_title)
# Collapse any run of non-simple chars to '_' (simple_title_chars is a
# module-level constant defined near the top of the file).
2126 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2128 # video uploader is domain name
2129 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this step extracts the
# uploader/domain — misleading diagnostic, presumably copy-pasted.
2131 self._downloader.trouble(u'ERROR: unable to extract title')
2133 video_uploader = mobj.group(1).decode('utf-8')
2136 # Process video information
2137 self._downloader.process_info({
2138 'id': video_id.decode('utf-8'),
2139 'url': video_url.decode('utf-8'),
2140 'uploader': video_uploader,
2141 'upload_date': u'NA',
2142 'title': video_title,
2143 'stitle': simple_title,
2144 'ext': video_extension.decode('utf-8'),
2148 except UnavailableVideoError, err:
2149 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearch[N|all]:query" pseudo-URLs: scrapes YouTube result
# pages and delegates each found video to the wrapped YoutubeIE.
# NOTE(review): elided listing — `if mobj is None:` / `return` / `try:`
# guard lines between statements are missing from this view.
2152 class YoutubeSearchIE(InfoExtractor):
2153 """Information Extractor for YouTube search queries."""
2154 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2155 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2156 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2157 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results fetched for the 'all' prefix and for large N.
2159 _max_youtube_results = 1000
2160 IE_NAME = u'youtube:search'
2162 def __init__(self, youtube_ie, downloader=None):
2163 InfoExtractor.__init__(self, downloader)
# Actual extraction is delegated per-video to this YoutubeIE instance.
2164 self._youtube_ie = youtube_ie
2166 def report_download_page(self, query, pagenum):
2167 """Report attempt to download playlist page with given number."""
2168 query = query.decode(preferredencoding())
2169 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2171 def _real_initialize(self):
2172 self._youtube_ie.initialize()
2174 def _real_extract(self, query):
# Parse the "ytsearchN:terms" prefix to decide how many results to pull.
2175 mobj = re.match(self._VALID_URL, query)
2177 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2180 prefix, query = query.split(':')
2182 query = query.encode('utf-8')
2184 self._download_n_results(query, 1)
2186 elif prefix == 'all':
2187 self._download_n_results(query, self._max_youtube_results)
2193 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2195 elif n > self._max_youtube_results:
2196 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2197 n = self._max_youtube_results
2198 self._download_n_results(query, n)
2200 except ValueError: # parsing prefix as integer fails
# Non-numeric prefix falls back to a single result.
2201 self._download_n_results(query, 1)
2204 def _download_n_results(self, query, n):
2205 """Downloads a specified number of results for a query"""
2208 already_seen = set()
2212 self.report_download_page(query, pagenum)
2213 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2214 request = urllib2.Request(result_url)
2216 page = urllib2.urlopen(request).read()
2217 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2218 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2221 # Extract video identifiers
2222 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href and pull the v= value; [:-1] drops the closing
# quote captured by the indicator regex.
2223 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2224 if video_id not in already_seen:
2225 video_ids.append(video_id)
2226 already_seen.add(video_id)
2227 if len(video_ids) == n:
2228 # Specified n videos reached
# NOTE(review): `id` shadows the builtin; style issue only.
2229 for id in video_ids:
2230 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means the last result page was reached.
2233 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2234 for id in video_ids:
2235 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2238 pagenum = pagenum + 1
# Handles "gvsearch[N|all]:query" pseudo-URLs for Google Video search.
# NOTE(review): structurally a near-duplicate of YoutubeSearchIE and
# YahooSearchIE — a shared base class would remove the triplication.
# Elided listing: guard lines between adjacent statements are missing.
2241 class GoogleSearchIE(InfoExtractor):
2242 """Information Extractor for Google Video search queries."""
2243 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2244 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2245 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2246 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2248 _max_google_results = 1000
2249 IE_NAME = u'video.google:search'
2251 def __init__(self, google_ie, downloader=None):
2252 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this GoogleIE instance.
2253 self._google_ie = google_ie
2255 def report_download_page(self, query, pagenum):
2256 """Report attempt to download playlist page with given number."""
2257 query = query.decode(preferredencoding())
2258 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2260 def _real_initialize(self):
2261 self._google_ie.initialize()
2263 def _real_extract(self, query):
2264 mobj = re.match(self._VALID_URL, query)
2266 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2269 prefix, query = query.split(':')
2271 query = query.encode('utf-8')
2273 self._download_n_results(query, 1)
2275 elif prefix == 'all':
2276 self._download_n_results(query, self._max_google_results)
2282 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2284 elif n > self._max_google_results:
2285 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2286 n = self._max_google_results
2287 self._download_n_results(query, n)
2289 except ValueError: # parsing prefix as integer fails
2290 self._download_n_results(query, 1)
2293 def _download_n_results(self, query, n):
2294 """Downloads a specified number of results for a query"""
2297 already_seen = set()
2301 self.report_download_page(query, pagenum)
2302 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2303 request = urllib2.Request(result_url)
2305 page = urllib2.urlopen(request).read()
2306 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2307 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2310 # Extract video identifiers
2311 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Unlike YoutubeSearchIE, the docid is a proper capture group here.
2312 video_id = mobj.group(1)
2313 if video_id not in already_seen:
2314 video_ids.append(video_id)
2315 already_seen.add(video_id)
2316 if len(video_ids) == n:
2317 # Specified n videos reached
2318 for id in video_ids:
2319 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2322 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2323 for id in video_ids:
2324 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2327 pagenum = pagenum + 1
# Handles "yvsearch[N|all]:query" pseudo-URLs for Yahoo! Video search.
# NOTE(review): near-duplicate of YoutubeSearchIE / GoogleSearchIE.
# Elided listing: guard lines between adjacent statements are missing.
2330 class YahooSearchIE(InfoExtractor):
2331 """Information Extractor for Yahoo! Video search queries."""
2332 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2333 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2334 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2335 _MORE_PAGES_INDICATOR = r'\s*Next'
2337 _max_yahoo_results = 1000
2338 IE_NAME = u'video.yahoo:search'
2340 def __init__(self, yahoo_ie, downloader=None):
2341 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this YahooIE instance.
2342 self._yahoo_ie = yahoo_ie
2344 def report_download_page(self, query, pagenum):
2345 """Report attempt to download playlist page with given number."""
2346 query = query.decode(preferredencoding())
2347 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2349 def _real_initialize(self):
2350 self._yahoo_ie.initialize()
2352 def _real_extract(self, query):
2353 mobj = re.match(self._VALID_URL, query)
2355 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2358 prefix, query = query.split(':')
2360 query = query.encode('utf-8')
2362 self._download_n_results(query, 1)
2364 elif prefix == 'all':
2365 self._download_n_results(query, self._max_yahoo_results)
2371 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2373 elif n > self._max_yahoo_results:
2374 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2375 n = self._max_yahoo_results
2376 self._download_n_results(query, n)
2378 except ValueError: # parsing prefix as integer fails
2379 self._download_n_results(query, 1)
2382 def _download_n_results(self, query, n):
2383 """Downloads a specified number of results for a query"""
2386 already_seen = set()
2390 self.report_download_page(query, pagenum)
2391 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2392 request = urllib2.Request(result_url)
2394 page = urllib2.urlopen(request).read()
2395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2396 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2399 # Extract video identifiers
2400 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Watch id has the form "<num>/<num>", per _VIDEO_INDICATOR above.
2401 video_id = mobj.group(1)
2402 if video_id not in already_seen:
2403 video_ids.append(video_id)
2404 already_seen.add(video_id)
2405 if len(video_ids) == n:
2406 # Specified n videos reached
2407 for id in video_ids:
2408 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2411 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2412 for id in video_ids:
2413 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2416 pagenum = pagenum + 1
# Walks YouTube playlist/artist/user-grid pages, collects video ids, and
# delegates each to the wrapped YoutubeIE.
# NOTE(review): elided listing — guard lines (`if mobj is None:`,
# `return`, `try:`) between adjacent statements are missing from view.
2419 class YoutubePlaylistIE(InfoExtractor):
2420 """Information Extractor for YouTube playlists."""
# Group 1: playlist-type parameter (p/a/list); group 2: playlist id;
# optional group 3: a single video id inside the playlist URL.
2422 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2423 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2424 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2425 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2427 IE_NAME = u'youtube:playlist'
2429 def __init__(self, youtube_ie, downloader=None):
2430 InfoExtractor.__init__(self, downloader)
2431 self._youtube_ie = youtube_ie
2433 def report_download_page(self, playlist_id, pagenum):
2434 """Report attempt to download playlist page with given number."""
2435 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2437 def _real_initialize(self):
2438 self._youtube_ie.initialize()
2440 def _real_extract(self, url):
2441 # Extract playlist id
2442 mobj = re.match(self._VALID_URL, url)
2444 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A URL that pins a single video short-circuits the playlist walk.
2448 if mobj.group(3) is not None:
2449 self._youtube_ie.extract(mobj.group(3))
2452 # Download playlist pages
2453 # prefix is 'p' as default for playlists but there are other types that need extra care
2454 playlist_prefix = mobj.group(1)
2455 if playlist_prefix == 'a':
2456 playlist_access = 'artist'
2458 playlist_prefix = 'p'
2459 playlist_access = 'view_play_list'
2460 playlist_id = mobj.group(2)
2465 self.report_download_page(playlist_id, pagenum)
2466 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2468 page = urllib2.urlopen(request).read()
2469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2473 # Extract video identifiers
# Deduplicate ids within a page while preserving order of appearance.
2475 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2476 if mobj.group(1) not in ids_in_page:
2477 ids_in_page.append(mobj.group(1))
2478 video_ids.extend(ids_in_page)
2480 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2482 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based
# start converted to 0-based slice index).
2484 playliststart = self._downloader.params.get('playliststart', 1) - 1
# NOTE(review): playlistend default -1 makes the slice drop the final
# entry (video_ids[start:-1]); YoutubeUserIE below special-cases -1 —
# this class does not. Presumably a latent off-by-one; confirm intent.
2485 playlistend = self._downloader.params.get('playlistend', -1)
2486 video_ids = video_ids[playliststart:playlistend]
2488 for id in video_ids:
2489 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Enumerates all uploads of a YouTube user via the GData API, paging
# _GDATA_PAGE_SIZE ids at a time, then delegates each to YoutubeIE.
# NOTE(review): elided listing — loop headers, `try:` and `return`
# lines between adjacent statements are missing from this view.
2493 class YoutubeUserIE(InfoExtractor):
2494 """Information Extractor for YouTube users."""
2496 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2497 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; hence the page-by-page loop below.
2498 _GDATA_PAGE_SIZE = 50
2499 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2500 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2502 IE_NAME = u'youtube:user'
2504 def __init__(self, youtube_ie, downloader=None):
2505 InfoExtractor.__init__(self, downloader)
2506 self._youtube_ie = youtube_ie
2508 def report_download_page(self, username, start_index):
2509 """Report attempt to download user page."""
2510 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2511 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2513 def _real_initialize(self):
2514 self._youtube_ie.initialize()
2516 def _real_extract(self, url):
2518 mobj = re.match(self._VALID_URL, url)
2520 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2523 username = mobj.group(1)
2525 # Download video ids using YouTube Data API. Result size per
2526 # query is limited (currently to 50 videos) so we need to query
2527 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2534 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2535 self.report_download_page(username, start_index)
2537 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2540 page = urllib2.urlopen(request).read()
2541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2542 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2545 # Extract video identifiers
2548 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2549 if mobj.group(1) not in ids_in_page:
2550 ids_in_page.append(mobj.group(1))
2552 video_ids.extend(ids_in_page)
2554 # A little optimization - if current page is not
2555 # "full", ie. does not contain PAGE_SIZE video ids then
2556 # we can assume that this page is the last one - there
2557 # are no more ids on further pages - no need to query
2560 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2565 all_ids_count = len(video_ids)
2566 playliststart = self._downloader.params.get('playliststart', 1) - 1
2567 playlistend = self._downloader.params.get('playlistend', -1)
# -1 sentinel means "no end limit"; note YoutubePlaylistIE lacks this
# special case — inconsistent handling between the two classes.
2569 if playlistend == -1:
2570 video_ids = video_ids[playliststart:]
2572 video_ids = video_ids[playliststart:playlistend]
# NOTE(review): plain str here vs u'' literals elsewhere — minor
# inconsistency only.
2574 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2575 (username, all_ids_count, len(video_ids)))
2577 for video_id in video_ids:
2578 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: POSTs the "Free download"
# form and scrapes the real fileshare URL and title from the response.
# NOTE(review): elided listing — `try:`, `return` and `if mobj is None:`
# guard lines between adjacent statements are missing from this view.
2581 class DepositFilesIE(InfoExtractor):
2582 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; the "../" matches a two-char locale
# path segment such as "en/".
2584 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2585 IE_NAME = u'DepositFiles'
2587 def __init__(self, downloader=None):
2588 InfoExtractor.__init__(self, downloader)
2590 def report_download_webpage(self, file_id):
2591 """Report webpage download."""
2592 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2594 def report_extraction(self, file_id):
2595 """Report information extraction."""
2596 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2598 def _real_initialize(self):
2601 def _real_extract(self, url):
2602 # At this point we have a new file
2603 self._downloader.increment_downloads()
2605 file_id = url.split('/')[-1]
2606 # Rebuild url in english locale
2607 url = 'http://depositfiles.com/en/files/' + file_id
2609 # Retrieve file webpage with 'Free download' button pressed
2610 free_download_indication = { 'gateway_result' : '1' }
# urlencode-ing a body makes this a POST request.
2611 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2613 self.report_download_webpage(file_id)
2614 webpage = urllib2.urlopen(request).read()
2615 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2616 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2619 # Search for the real file URL
2620 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2621 if (mobj is None) or (mobj.group(1) is None):
2622 # Try to figure out reason of the error.
2623 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2624 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string — works because \s is not a
# recognized string escape, but should be r'\s+' for hygiene.
2625 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2626 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2628 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2631 file_url = mobj.group(1)
2632 file_extension = os.path.splitext(file_url)[1][1:]
2634 # Search for file title
2635 mobj = re.search(r'<b title="(.*?)">', webpage)
2637 self._downloader.trouble(u'ERROR: unable to extract title')
2639 file_title = mobj.group(1).decode('utf-8')
2642 # Process file information
2643 self._downloader.process_info({
2644 'id': file_id.decode('utf-8'),
2645 'url': file_url.decode('utf-8'),
2647 'upload_date': u'NA',
2648 'title': file_title,
# stitle is not simplified here, unlike the video extractors.
2649 'stitle': file_title,
2650 'ext': file_extension.decode('utf-8'),
2654 except UnavailableVideoError, err:
2655 self._downloader.trouble(u'ERROR: unable to download file')
# Facebook video extractor: logs in via the mobile login form (username/
# password options or .netrc), downloads the video page, and parses
# metadata plus per-quality URLs out of inline JavaScript.
# NOTE(review): heavily elided listing — `try:`, `return`, `else:` and
# assignment lines between adjacent statements are missing from view.
2658 class FacebookIE(InfoExtractor):
2659 """Information Extractor for Facebook"""
2661 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2662 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2663 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this order.
2664 _available_formats = ['highqual', 'lowqual']
2665 _video_extensions = {
2669 IE_NAME = u'facebook'
2671 def __init__(self, downloader=None):
2672 InfoExtractor.__init__(self, downloader)
2674 def _reporter(self, message):
2675 """Add header and report message."""
2676 self._downloader.to_screen(u'[facebook] %s' % message)
2678 def report_login(self):
2679 """Report attempt to log in."""
2680 self._reporter(u'Logging in')
2682 def report_video_webpage_download(self, video_id):
2683 """Report attempt to download video webpage."""
2684 self._reporter(u'%s: Downloading video webpage' % video_id)
2686 def report_information_extraction(self, video_id):
2687 """Report attempt to extract video information."""
2688 self._reporter(u'%s: Extracting video information' % video_id)
2690 def _parse_page(self, video_webpage):
2691 """Extract video information from page"""
# Field name -> regex over the raw page; missing fields are simply
# absent from the result dict.
2693 data = {'title': r'class="video_title datawrap">(.*?)</',
2694 'description': r'<div class="datawrap">(.*?)</div>',
2695 'owner': r'\("video_owner_name", "(.*?)"\)',
2696 'upload_date': r'data-date="(.*?)"',
2697 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2700 for piece in data.keys():
2701 mobj = re.search(data[piece], video_webpage)
2702 if mobj is not None:
# Values are JS-escaped inside the page; undo both layers.
2703 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2707 for fmt in self._available_formats:
2708 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2709 if mobj is not None:
2710 # URL is in a Javascript segment inside an escaped Unicode format within
2711 # the generally utf-8 page
2712 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2713 video_info['video_urls'] = video_urls
2717 def _real_initialize(self):
# Without a downloader there is no way to get credentials; bail early.
2718 if self._downloader is None:
2723 downloader_params = self._downloader.params
2725 # Attempt to use provided username and password or .netrc data
2726 if downloader_params.get('username', None) is not None:
2727 useremail = downloader_params['username']
2728 password = downloader_params['password']
2729 elif downloader_params.get('usenetrc', False):
2731 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2732 if info is not None:
2736 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2737 except (IOError, netrc.NetrcParseError), err:
2738 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2741 if useremail is None:
2750 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2753 login_results = urllib2.urlopen(request).read()
# The login form re-appearing in the response means login failed.
2754 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): "exceded" typo in the user-facing message (left as-is;
# a doc pass must not alter runtime strings).
2755 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2757 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2758 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2761 def _real_extract(self, url):
2762 mobj = re.match(self._VALID_URL, url)
2764 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2766 video_id = mobj.group('ID')
2769 self.report_video_webpage_download(video_id)
2770 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2772 page = urllib2.urlopen(request)
2773 video_webpage = page.read()
2774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2775 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2778 # Start extracting information
2779 self.report_information_extraction(video_id)
2781 # Extract information
2782 video_info = self._parse_page(video_webpage)
2785 if 'owner' not in video_info:
2786 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2788 video_uploader = video_info['owner']
2791 if 'title' not in video_info:
2792 self._downloader.trouble(u'ERROR: unable to extract video title')
2794 video_title = video_info['title']
2795 video_title = video_title.decode('utf-8')
2796 video_title = sanitize_title(video_title)
2799 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2800 simple_title = simple_title.strip(ur'_')
# Thumbnail is optional: warn and continue with an empty string.
2803 if 'thumbnail' not in video_info:
2804 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2805 video_thumbnail = ''
2807 video_thumbnail = video_info['thumbnail']
2811 if 'upload_date' in video_info:
2812 upload_time = video_info['upload_date']
# Parse RFC 2822-style date into YYYYMMDD.
2813 timetuple = email.utils.parsedate_tz(upload_time)
2814 if timetuple is not None:
2816 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2821 video_description = video_info.get('description', 'No description available.')
2823 url_map = video_info['video_urls']
2824 if len(url_map.keys()) > 0:
2825 # Decide which formats to download
2826 req_format = self._downloader.params.get('format', None)
2827 format_limit = self._downloader.params.get('format_limit', None)
# Respect --max-quality by truncating the best-first format list.
2829 if format_limit is not None and format_limit in self._available_formats:
2830 format_list = self._available_formats[self._available_formats.index(format_limit):]
2832 format_list = self._available_formats
2833 existing_formats = [x for x in format_list if x in url_map]
2834 if len(existing_formats) == 0:
2835 self._downloader.trouble(u'ERROR: no known formats available for video')
2837 if req_format is None:
2838 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2839 elif req_format == 'worst':
2840 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2841 elif req_format == '-1':
2842 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2845 if req_format not in url_map:
2846 self._downloader.trouble(u'ERROR: requested format not available')
2848 video_url_list = [(req_format, url_map[req_format])] # Specific format
2850 for format_param, video_real_url in video_url_list:
2852 # At this point we have a new video
2853 self._downloader.increment_downloads()
2856 video_extension = self._video_extensions.get(format_param, 'mp4')
2859 # Process video information
2860 self._downloader.process_info({
2861 'id': video_id.decode('utf-8'),
2862 'url': video_real_url.decode('utf-8'),
2863 'uploader': video_uploader.decode('utf-8'),
2864 'upload_date': upload_date,
2865 'title': video_title,
2866 'stitle': simple_title,
2867 'ext': video_extension.decode('utf-8'),
2868 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2869 'thumbnail': video_thumbnail.decode('utf-8'),
2870 'description': video_description.decode('utf-8'),
2873 except UnavailableVideoError, err:
2874 self._downloader.trouble(u'\nERROR: unable to download video')
# blip.tv extractor: appends skin=json to the URL and reads all metadata
# from the JSON API response instead of scraping HTML.
# NOTE(review): elided listing — `try:`, `return` and assignment lines
# between adjacent statements are missing from this view.
2876 class BlipTVIE(InfoExtractor):
2877 """Information extractor for blip.tv"""
2879 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Pulls the filename extension from the media URL.
2880 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2881 IE_NAME = u'blip.tv'
2883 def report_extraction(self, file_id):
2884 """Report information extraction."""
2885 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2887 def _simplify_title(self, title):
# Same simplification as the module-wide pattern used by other IEs;
# duplicated here (ComedyCentralIE has an identical copy).
2888 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2889 res = res.strip(ur'_')
2892 def _real_extract(self, url):
2893 mobj = re.match(self._VALID_URL, url)
2895 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar is '?' or '&' depending on whether the URL already has a query.
2902 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2903 request = urllib2.Request(json_url)
2904 self.report_extraction(mobj.group(1))
2906 json_code = urllib2.urlopen(request).read()
2907 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2908 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# json may be stdlib json or the bundled trivialjson fallback (see file
# header) — both expose loads().
2911 json_data = json.loads(json_code)
2912 if 'Post' in json_data:
2913 data = json_data['Post']
# blip.tv datestamps look like '08-18-11 12:34PM'; normalize to YYYYMMDD.
2917 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2918 video_url = data['media']['url']
2919 umobj = re.match(self._URL_EXT, video_url)
2921 raise ValueError('Can not determine filename extension')
2922 ext = umobj.group(1)
2924 self._downloader.increment_downloads()
2927 'id': data['item_id'],
2929 'uploader': data['display_name'],
2930 'upload_date': upload_date,
2931 'title': data['title'],
2932 'stitle': self._simplify_title(data['title']),
2934 'format': data['media']['mimeType'],
2935 'thumbnail': data['thumbnailUrl'],
2936 'description': data['description'],
2937 'player_url': data['embedUrl']
# Both JSON-shape errors and missing keys surface as a parse failure.
2939 except (ValueError,KeyError), err:
2940 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2944 self._downloader.process_info(info)
2945 except UnavailableVideoError, err:
2946 self._downloader.trouble(u'\nERROR: unable to download video')
# myvideo.de extractor: derives the .flv URL from the thumbnail image
# path and the title from the HTML <title> tag.
# NOTE(review): elided listing — guard lines (`if mobj is None:`,
# `return`, `try:`) between adjacent statements are missing from view.
2949 class MyVideoIE(InfoExtractor):
2950 """Information Extractor for myvideo.de."""
2952 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2953 IE_NAME = u'myvideo'
2955 def __init__(self, downloader=None):
2956 InfoExtractor.__init__(self, downloader)
2958 def report_download_webpage(self, video_id):
2959 """Report webpage download."""
2960 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2962 def report_extraction(self, video_id):
2963 """Report information extraction."""
2964 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2966 def _real_initialize(self):
2969 def _real_extract(self,url):
2970 mobj = re.match(self._VALID_URL, url)
# BUG(review): `self._download` — no such attribute; every other IE uses
# `self._downloader.trouble(...)`. This raises AttributeError instead of
# reporting the invalid URL. Fix: self._downloader.trouble(...).
2972 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2975 video_id = mobj.group(1)
# The URL slug doubles as the simple title.
2976 simple_title = mobj.group(2).decode('utf-8')
2977 # should actually not be necessary
2978 simple_title = sanitize_title(simple_title)
2979 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2982 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2984 self.report_download_webpage(video_id)
2985 webpage = urllib2.urlopen(request).read()
2986 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2990 self.report_extraction(video_id)
# The movie directory embedded in the thumbnail URL also hosts the .flv.
2991 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2994 self._downloader.trouble(u'ERROR: unable to extract media URL')
2996 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2998 mobj = re.search('<title>([^<]+)</title>', webpage)
3000 self._downloader.trouble(u'ERROR: unable to extract title')
3003 video_title = mobj.group(1)
3004 video_title = sanitize_title(video_title)
3008 self._downloader.process_info({
3012 'upload_date': u'NA',
3013 'title': video_title,
3014 'stitle': simple_title,
3019 except UnavailableVideoError:
3020 self._downloader.trouble(u'\nERROR: Unable to download video')
# Scraper for Comedy Central full-episode pages (The Daily Show / Colbert
# Report).  NOTE(review): this region is a line-numbered dump with gaps —
# e.g. the "if mobj is None:" / "return" guards that normally surround the
# trouble() calls, and the turls list construction, are not visible here.
# Confirm against the full file before editing.
3022 class ComedyCentralIE(InfoExtractor):
3023 """Information extractor for The Daily Show and Colbert Report """
# Matches either a ":shortname" alias (tds, colbert, ...) or a
# thedailyshow.com / colbertnation.com full-episodes URL.
3025 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3026 IE_NAME = u'comedycentral'
# --- progress-reporting helpers: one to_screen() line per download stage ---
3028 def report_extraction(self, episode_id):
3029 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3031 def report_config_download(self, episode_id):
3032 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3034 def report_index_download(self, episode_id):
3035 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3037 def report_player_url(self, episode_id):
3038 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Collapse any run of characters outside simple_title_chars into '_' and trim
# leading/trailing underscores.  (The "return res" line is on a line absent
# from this dump.)
3040 def _simplify_title(self, title):
3041 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3042 res = res.strip(ur'_')
3045 def _real_extract(self, url):
3046 mobj = re.match(self._VALID_URL, url)
3048 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ":tds"-style shortcuts are rewritten to the show's full-episodes index URL
# and re-matched so the showname/episode groups get populated.
3051 if mobj.group('shortname'):
3052 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3053 url = 'http://www.thedailyshow.com/full-episodes/'
3055 url = 'http://www.colbertnation.com/full-episodes/'
3056 mobj = re.match(self._VALID_URL, url)
3057 assert mobj is not None
# An empty episode group means "newest episode": fetch the index page and
# let the server redirect to the latest one.
3059 dlNewest = not mobj.group('episode')
3061 epTitle = mobj.group('showname')
3063 epTitle = mobj.group('episode')
3065 req = urllib2.Request(url)
3066 self.report_extraction(epTitle)
3068 htmlHandle = urllib2.urlopen(req)
3069 html = htmlHandle.read()
3070 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3071 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After redirects, geturl() yields the concrete episode URL.
3074 url = htmlHandle.geturl()
3075 mobj = re.match(self._VALID_URL, url)
3077 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3079 if mobj.group('episode') == '':
3080 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3082 epTitle = mobj.group('episode')
# The Flash <param name="movie"> value carries both the player URL (group 1)
# and the mtvnservices "uri" identifier (group 2) used by the feeds below.
3084 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3085 if len(mMovieParams) == 0:
3086 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3089 playerUrl_raw = mMovieParams[0][0]
3090 self.report_player_url(epTitle)
# Resolve the player URL through its redirects once, up front.
3092 urlHandle = urllib2.urlopen(playerUrl_raw)
3093 playerUrl = urlHandle.geturl()
3094 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3095 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3098 uri = mMovieParams[0][1]
3099 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3100 self.report_index_download(epTitle)
3102 indexXml = urllib2.urlopen(indexUrl).read()
3103 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3104 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# The MRSS index has one <item> per video part; the guid encodes show and
# media id as colon-separated fields.
3107 idoc = xml.etree.ElementTree.fromstring(indexXml)
3108 itemEls = idoc.findall('.//item')
3109 for itemEl in itemEls:
3110 mediaId = itemEl.findall('./guid')[0].text
3111 shortMediaId = mediaId.split(':')[-1]
3112 showId = mediaId.split(':')[-2].replace('.com', '')
3113 officialTitle = itemEl.findall('./title')[0].text
3114 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item mediaGen config XML enumerates the available renditions.
3116 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3117 urllib.urlencode({'uri': mediaId}))
3118 configReq = urllib2.Request(configUrl)
3119 self.report_config_download(epTitle)
3121 configXml = urllib2.urlopen(configReq).read()
3122 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3123 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3126 cdoc = xml.etree.ElementTree.fromstring(configXml)
3128 for rendition in cdoc.findall('.//rendition'):
3129 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3133 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
# turls[-1] presumably assumes the renditions list is ordered by ascending
# bitrate; the lines building turls from finfo are absent from this dump —
# verify.
3136 # For now, just pick the highest bitrate
3137 format,video_url = turls[-1]
3139 self._downloader.increment_downloads()
3141 effTitle = showId + '-' + epTitle
# info dict handed to process_info(); several keys of this literal are on
# lines absent from this dump.
3146 'upload_date': officialDate,
3148 'stitle': self._simplify_title(effTitle),
3152 'description': officialTitle,
3153 'player_url': playerUrl
3157 self._downloader.process_info(info)
3158 except UnavailableVideoError, err:
3159 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Scraper for escapistmagazine.com video pages.  NOTE(review): line-numbered
# dump with gaps — the "if ... is None:" / "return" guards around the
# trouble() calls are not visible here.
3163 class EscapistIE(InfoExtractor):
3164 """Information extractor for The Escapist """
# NOTE(review): "(www\.)" has no trailing '?', so URLs without the www.
# prefix cannot match, and the dot in "escapistmagazine.com" is unescaped —
# confirm whether this is intentional (other IEs in this file use
# "(www\.)?" and escape the dot).
3166 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3167 IE_NAME = u'escapist'
3169 def report_extraction(self, showName):
3170 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3172 def report_config_download(self, showName):
3173 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Same simple-title normalisation as the other extractors; the "return res"
# line is on a line absent from this dump.
3175 def _simplify_title(self, title):
3176 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3177 res = res.strip(ur'_')
3180 def _real_extract(self, url):
3181 htmlParser = HTMLParser.HTMLParser()
3183 mobj = re.match(self._VALID_URL, url)
3185 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3187 showName = mobj.group('showname')
3188 videoId = mobj.group('episode')
3190 self.report_extraction(showName)
3192 webPage = urllib2.urlopen(url).read()
3193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3194 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Pull description / thumbnail / player URL out of the page's meta tags.
# NOTE(review): each .group(1) raises AttributeError if its re.search found
# nothing — no None checks are visible in this dump.
3197 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3198 description = htmlParser.unescape(descMatch.group(1))
3199 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3200 imgUrl = htmlParser.unescape(imgMatch.group(1))
3201 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3202 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries the real config location in its "config=" query arg.
3203 configUrlMatch = re.search('config=(.*)$', playerUrl)
3204 configUrl = urllib2.unquote(configUrlMatch.group(1))
3206 self.report_config_download(showName)
3208 configJSON = urllib2.urlopen(configUrl).read()
3209 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3210 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
# The config is JavaScript with single-quoted strings; the naive quote swap
# makes it JSON-parseable but breaks if any value legitimately contains a
# quote character.
3213 # Technically, it's JavaScript, not JSON
3214 configJSON = configJSON.replace("'", '"')
3217 config = json.loads(configJSON)
3218 except (ValueError,), err:
3219 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is taken as the actual video entry — presumably index 0 is an
# intro/ad clip; verify against the live config format.
3222 playlist = config['playlist']
3223 videoUrl = playlist[1]['url']
3225 self._downloader.increment_downloads()
# info dict handed to process_info(); several keys of this literal are on
# lines absent from this dump.
3229 'uploader': showName,
3230 'upload_date': None,
3232 'stitle': self._simplify_title(showName),
3235 'thumbnail': imgUrl,
3236 'description': description,
3237 'player_url': playerUrl,
3241 self._downloader.process_info(info)
3242 except UnavailableVideoError, err:
3243 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Abstract base for the downloader's post-processing chain; concrete PPs
# (e.g. FFmpegExtractAudioPP below) override run().  (Line-numbered dump;
# a few docstring lines are absent.)
3247 class PostProcessor(object):
3248 """Post Processor class.
3250 PostProcessor objects can be added to downloaders with their
3251 add_post_processor() method. When the downloader has finished a
3252 successful download, it will take its internal chain of PostProcessors
3253 and start calling the run() method on each one of them, first with
3254 an initial argument and then with the returned value of the previous
3257 The chain will be stopped if one of them ever returns None or the end
3258 of the chain is reached.
3260 PostProcessor objects follow a "mutual registration" process similar
3261 to InfoExtractor objects.
# _downloader may stay None until set_downloader() is called during
# registration.
3266 def __init__(self, downloader=None):
3267 self._downloader = downloader
3269 def set_downloader(self, downloader):
3270 """Sets the downloader for this PP."""
3271 self._downloader = downloader
3273 def run(self, information):
3274 """Run the PostProcessor.
3276 The "information" argument is a dictionary like the ones
3277 composed by InfoExtractors. The only difference is that this
3278 one has an extra field called "filepath" that points to the
3281 When this method returns None, the postprocessing chain is
3282 stopped. However, this method may return an information
3283 dictionary that will be passed to the next postprocessing
3284 object in the chain. It can be the one it received after
3285 changing some fields.
3287 In addition, this method may raise a PostProcessingError
3288 exception that will be taken into account by the downloader
3291 return information # by default, do nothing
# Post-processor that extracts the audio track using ffmpeg/ffprobe.
# NOTE(review): numbered dump with gaps — the @staticmethod decorators
# presumably above get_audio_codec/run_ffmpeg, the "try:" headers, and
# several return/else lines are not visible here; confirm before editing.
3294 class FFmpegExtractAudioPP(PostProcessor):
3296 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3297 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source codec losslessly when it is aac or mp3.
3298 if preferredcodec is None:
3299 preferredcodec = 'best'
3300 self._preferredcodec = preferredcodec
3301 self._preferredquality = preferredquality
3302 self._keepvideo = keepvideo
# Probe the file with ffprobe and report its audio codec name (None when
# probing fails; the return paths are on lines absent from this dump).
3305 def get_audio_codec(path):
3307 cmd = ['ffprobe', '-show_streams', '--', path]
3308 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3309 output = handle.communicate()[0]
3310 if handle.wait() != 0:
3312 except (IOError, OSError):
# Scan -show_streams output: remember the last codec_name seen and accept
# it once a codec_type=audio line confirms it belongs to an audio stream.
3315 for line in output.split('\n'):
3316 if line.startswith('codec_name='):
3317 audio_codec = line.split('=')[1].strip()
3318 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Invoke ffmpeg to write out_path with the given codec/options.
# NOTE(review): '--' is meant as an end-of-options guard for out_path, but
# verify that the installed ffmpeg actually accepts '--' on its command line.
3323 def run_ffmpeg(path, out_path, codec, more_opts):
3325 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3326 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3328 except (IOError, OSError):
3331 def run(self, information):
3332 path = information['filepath']
3334 filecodec = self.get_audio_codec(path)
3335 if filecodec is None:
3336 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Lossless copy when the file already holds the wanted (or 'best') codec...
3340 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3341 if filecodec == 'aac' or filecodec == 'mp3':
3342 # Lossless if possible
3344 extension = filecodec
# Raw AAC needs an ADTS container to be playable standalone.
3345 if filecodec == 'aac':
3346 more_opts = ['-f', 'adts']
3349 acodec = 'libmp3lame'
3352 if self._preferredquality is not None:
3353 more_opts += ['-ab', self._preferredquality]
# ...otherwise transcode (lossy) to the explicitly preferred codec.
3355 # We convert the audio (lossy)
3356 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3357 extension = self._preferredcodec
3359 if self._preferredquality is not None:
3360 more_opts += ['-ab', self._preferredquality]
3361 if self._preferredcodec == 'aac':
3362 more_opts += ['-f', 'adts']
# Output file: same basename, new audio extension.
3364 (prefix, ext) = os.path.splitext(path)
3365 new_path = prefix + '.' + extension
3366 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3367 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3370 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Carry the original file's timestamp over to the extracted audio when known.
3373 # Try to update the date time for extracted audio file.
3374 if information.get('filetime') is not None:
3376 os.utime(new_path, (time.time(), information['filetime']))
3378 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Delete the source video unless -k/--keep-video was given.
3380 if not self._keepvideo:
3383 except (IOError, OSError):
3384 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Hand the updated info dict (now pointing at the audio file) down the chain.
3387 information['filepath'] = new_path
# Self-update: overwrite this script file with the latest version fetched
# from UPDATE_URL.  NOTE(review): numbered dump with gaps — the "try:"
# headers and the urlh/outf close() calls are not visible here.
3391 def updateSelf(downloader, filename):
3392 ''' Update the program file with the latest version from the repository '''
3393 # Note: downloader only used for options
# Fail early if we cannot write to our own file.
3394 if not os.access(filename, os.W_OK):
3395 sys.exit('ERROR: no write permissions on %s' % filename)
3397 downloader.to_screen('Updating to latest version...')
3401 urlh = urllib.urlopen(UPDATE_URL)
3402 newcontent = urlh.read()
3405 except (IOError, OSError), err:
3406 sys.exit('ERROR: unable to download latest version')
# Binary mode so the downloaded bytes are written through unchanged.
3409 outf = open(filename, 'wb')
3411 outf.write(newcontent)
3414 except (IOError, OSError), err:
3415 sys.exit('ERROR: unable to overwrite current version')
3417 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Render an option's flags for --help output, e.g. "-o, --output TEMPLATE".
# Used as a replacement for IndentedHelpFormatter.format_option_strings.
# NOTE(review): the "opts = []" initialisation is on a line absent from this
# numbered dump.
3424 def _format_option_string(option):
3425 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Prefer the first short and first long spelling of the option.
3429 if option._short_opts: opts.append(option._short_opts[0])
3430 if option._long_opts: opts.append(option._long_opts[0])
3431 if len(opts) > 1: opts.insert(1, ', ')
3433 if option.takes_value(): opts.append(' %s' % option.metavar)
3435 return "".join(opts)
# Best-effort terminal width: $COLUMNS first, then `stty size`.
# NOTE(review): the surrounding try/except and the fallback return path are
# on lines absent from this numbered dump.
3437 def _find_term_columns():
3438 columns = os.environ.get('COLUMNS', None)
# `stty size` prints "rows cols"; take the second field.
3443 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3444 out,err = sp.communicate()
3445 return int(out.split()[1])
# Remainder of parseOpts(): build the OptionParser with a custom help
# formatter, declare all option groups, and parse sys.argv.  (The enclosing
# "def parseOpts():" line and a handful of others are absent from this
# numbered dump.)
3451 max_help_position = 80
3453 # No need to wrap help messages if we're on a wide console
3454 columns = _find_term_columns()
3455 if columns: max_width = columns
3457 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Swap in our "-o, --output TEMPLATE" flag renderer.
3458 fmt.format_option_strings = _format_option_string
# kw dict for OptionParser (its opening line is absent from this dump).
3461 'version' : __version__,
3463 'usage' : '%prog [options] url [url...]',
3464 'conflict_handler' : 'resolve',
3467 parser = optparse.OptionParser(**kw)
# One OptionGroup per section of the --help output.
3470 general = optparse.OptionGroup(parser, 'General Options')
3471 selection = optparse.OptionGroup(parser, 'Video Selection')
3472 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3473 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3474 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3475 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3476 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3478 general.add_option('-h', '--help',
3479 action='help', help='print this help text and exit')
3480 general.add_option('-v', '--version',
3481 action='version', help='print program version and exit')
3482 general.add_option('-U', '--update',
3483 action='store_true', dest='update_self', help='update this program to latest version')
3484 general.add_option('-i', '--ignore-errors',
3485 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3486 general.add_option('-r', '--rate-limit',
3487 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3488 general.add_option('-R', '--retries',
3489 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3490 general.add_option('--dump-user-agent',
3491 action='store_true', dest='dump_user_agent',
3492 help='display the current browser identification', default=False)
3493 general.add_option('--list-extractors',
3494 action='store_true', dest='list_extractors',
3495 help='List all supported extractors and the URLs they would handle', default=False)
3497 selection.add_option('--playlist-start',
3498 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3499 selection.add_option('--playlist-end',
3500 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3501 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3502 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3504 authentication.add_option('-u', '--username',
3505 dest='username', metavar='USERNAME', help='account username')
3506 authentication.add_option('-p', '--password',
3507 dest='password', metavar='PASSWORD', help='account password')
3508 authentication.add_option('-n', '--netrc',
3509 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3512 video_format.add_option('-f', '--format',
3513 action='store', dest='format', metavar='FORMAT', help='video format code')
# --all-formats is implemented as the special format code '-1'.
3514 video_format.add_option('--all-formats',
3515 action='store_const', dest='format', help='download all available video formats', const='-1')
3516 video_format.add_option('--max-quality',
3517 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3520 verbosity.add_option('-q', '--quiet',
3521 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3522 verbosity.add_option('-s', '--simulate',
3523 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3524 verbosity.add_option('--skip-download',
3525 action='store_true', dest='skip_download', help='do not download the video', default=False)
3526 verbosity.add_option('-g', '--get-url',
3527 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3528 verbosity.add_option('-e', '--get-title',
3529 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3530 verbosity.add_option('--get-thumbnail',
3531 action='store_true', dest='getthumbnail',
3532 help='simulate, quiet but print thumbnail URL', default=False)
3533 verbosity.add_option('--get-description',
3534 action='store_true', dest='getdescription',
3535 help='simulate, quiet but print video description', default=False)
3536 verbosity.add_option('--get-filename',
3537 action='store_true', dest='getfilename',
3538 help='simulate, quiet but print output filename', default=False)
3539 verbosity.add_option('--get-format',
3540 action='store_true', dest='getformat',
3541 help='simulate, quiet but print output format', default=False)
3542 verbosity.add_option('--no-progress',
3543 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3544 verbosity.add_option('--console-title',
3545 action='store_true', dest='consoletitle',
3546 help='display progress in console titlebar', default=False)
3549 filesystem.add_option('-t', '--title',
3550 action='store_true', dest='usetitle', help='use title in file name', default=False)
3551 filesystem.add_option('-l', '--literal',
3552 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3553 filesystem.add_option('-A', '--auto-number',
3554 action='store_true', dest='autonumber',
3555 help='number downloaded files starting from 00000', default=False)
3556 filesystem.add_option('-o', '--output',
3557 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3558 filesystem.add_option('-a', '--batch-file',
3559 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3560 filesystem.add_option('-w', '--no-overwrites',
3561 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3562 filesystem.add_option('-c', '--continue',
3563 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3564 filesystem.add_option('--cookies',
3565 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3566 filesystem.add_option('--no-part',
3567 action='store_true', dest='nopart', help='do not use .part files', default=False)
# store_false: --no-mtime switches the default-on updatetime behaviour off.
3568 filesystem.add_option('--no-mtime',
3569 action='store_false', dest='updatetime',
3570 help='do not use the Last-modified header to set the file modification time', default=True)
3571 filesystem.add_option('--write-description',
3572 action='store_true', dest='writedescription',
3573 help='write video description to a .description file', default=False)
3574 filesystem.add_option('--write-info-json',
3575 action='store_true', dest='writeinfojson',
3576 help='write video metadata to a .info.json file', default=False)
3579 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3580 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3581 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3582 help='"best", "aac" or "mp3"; best by default')
3583 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3584 help='ffmpeg audio bitrate specification, 128k by default')
3585 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3586 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Group registration order determines the section order in --help.
3589 parser.add_option_group(general)
3590 parser.add_option_group(selection)
3591 parser.add_option_group(filesystem)
3592 parser.add_option_group(verbosity)
3593 parser.add_option_group(video_format)
3594 parser.add_option_group(authentication)
3595 parser.add_option_group(postproc)
3597 opts, args = parser.parse_args()
3599 return parser, opts, args
# Build the ordered list of extractor instances; the first one whose
# suitable() matches handles the URL.  NOTE(review): several list entries
# and the opening "return [" are on lines absent from this numbered dump.
3601 def gen_extractors():
3602 """ Return a list of an instance of every supported extractor.
3603 The order does matter; the first extractor matched is the one handling the URL.
# Primary IE instances that dependent extractors are registered against
# ("mutual registration", cf. the InfoExtractor docs elsewhere in the file).
3605 youtube_ie = YoutubeIE()
3606 google_ie = GoogleIE()
3607 yahoo_ie = YahooIE()
3610 MetacafeIE(youtube_ie),
3612 YoutubePlaylistIE(youtube_ie),
3613 YoutubeUserIE(youtube_ie),
3614 YoutubeSearchIE(youtube_ie),
3616 GoogleSearchIE(google_ie),
3619 YahooSearchIE(yahoo_ie),
# Body of the program's main() routine: cookie jar setup, batch-file
# handling, option validation, FileDownloader construction, the download
# run, and cookie-jar persistence.  (The enclosing "def main():" line and
# several try/except/exit lines are absent from this numbered dump.)
3632 parser, opts, args = parseOpts()
3634 # Open appropriate CookieJar
# Without --cookies: in-memory jar; with it: Mozilla-format file jar, loaded
# here and saved again at the end of the run.
3635 if opts.cookiefile is None:
3636 jar = cookielib.CookieJar()
3639 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3640 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3642 except (IOError, OSError), err:
3643 sys.exit(u'ERROR: unable to open cookie file')
3646 if opts.dump_user_agent:
3647 print std_headers['User-Agent']
3650 # Batch file verification
3652 if opts.batchfile is not None:
3654 if opts.batchfile == '-':
3657 batchfd = open(opts.batchfile, 'r')
3658 batchurls = batchfd.readlines()
3659 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and comment lines starting with #, / or ;.
3660 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3662 sys.exit(u'ERROR: batch file could not be read')
3663 all_urls = batchurls + args
3665 # General configuration
3666 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3667 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3668 urllib2.install_opener(opener)
3669 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3671 extractors = gen_extractors()
# --list-extractors: print each extractor and the given URLs it would
# handle, claiming URLs greedily in extractor order.
3673 if opts.list_extractors:
3674 for ie in extractors:
3676 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3677 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3678 for mu in matchedUrls:
3682 # Conflicting, missing and erroneous options
3683 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3684 parser.error(u'using .netrc conflicts with giving username/password')
3685 if opts.password is not None and opts.username is None:
3686 parser.error(u'account username missing')
3687 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3688 parser.error(u'using output template conflicts with using title, literal title or auto number')
3689 if opts.usetitle and opts.useliteral:
3690 parser.error(u'using title conflicts with using literal title')
# Prompt for the password interactively when only -u was given.
3691 if opts.username is not None and opts.password is None:
3692 opts.password = getpass.getpass(u'Type account password and press return:')
3693 if opts.ratelimit is not None:
3694 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3695 if numeric_limit is None:
3696 parser.error(u'invalid rate limit specified')
3697 opts.ratelimit = numeric_limit
3698 if opts.retries is not None:
3700 opts.retries = long(opts.retries)
3701 except (TypeError, ValueError), err:
3702 parser.error(u'invalid retry count specified')
3704 opts.playliststart = int(opts.playliststart)
3705 if opts.playliststart <= 0:
3706 raise ValueError(u'Playlist start must be positive')
3707 except (TypeError, ValueError), err:
3708 parser.error(u'invalid playlist start number specified')
3710 opts.playlistend = int(opts.playlistend)
3711 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3712 raise ValueError(u'Playlist end must be greater than playlist start')
3713 except (TypeError, ValueError), err:
3714 parser.error(u'invalid playlist end number specified')
3715 if opts.extractaudio:
3716 if opts.audioformat not in ['best', 'aac', 'mp3']:
3717 parser.error(u'invalid audio format specified')
# Any of the --get-* flags implies quiet simulation (no actual download).
3720 fd = FileDownloader({
3721 'usenetrc': opts.usenetrc,
3722 'username': opts.username,
3723 'password': opts.password,
3724 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3725 'forceurl': opts.geturl,
3726 'forcetitle': opts.gettitle,
3727 'forcethumbnail': opts.getthumbnail,
3728 'forcedescription': opts.getdescription,
3729 'forcefilename': opts.getfilename,
3730 'forceformat': opts.getformat,
3731 'simulate': opts.simulate,
3732 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3733 'format': opts.format,
3734 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise the first matching default
# based on --all-formats ('-1') / title / literal / autonumber combinations.
3735 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3736 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3737 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3738 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3739 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3740 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3741 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3742 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3743 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3744 or u'%(id)s.%(ext)s'),
3745 'ignoreerrors': opts.ignoreerrors,
3746 'ratelimit': opts.ratelimit,
3747 'nooverwrites': opts.nooverwrites,
3748 'retries': opts.retries,
3749 'continuedl': opts.continue_dl,
3750 'noprogress': opts.noprogress,
3751 'playliststart': opts.playliststart,
3752 'playlistend': opts.playlistend,
# Writing the video to stdout ("-o -") forces logging onto stderr.
3753 'logtostderr': opts.outtmpl == '-',
3754 'consoletitle': opts.consoletitle,
3755 'nopart': opts.nopart,
3756 'updatetime': opts.updatetime,
3757 'writedescription': opts.writedescription,
3758 'writeinfojson': opts.writeinfojson,
3759 'matchtitle': opts.matchtitle,
3760 'rejecttitle': opts.rejecttitle,
3762 for extractor in extractors:
3763 fd.add_info_extractor(extractor)
3766 if opts.extractaudio:
3767 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3770 if opts.update_self:
3771 updateSelf(fd, sys.argv[0])
# With no URLs, -U alone is fine; anything else is a usage error.
3774 if len(all_urls) < 1:
3775 if not opts.update_self:
3776 parser.error(u'you must provide at least one URL')
3779 retcode = fd.download(all_urls)
3781 # Dump cookie jar if requested
3782 if opts.cookiefile is not None:
3785 except (IOError, OSError), err:
3786 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run main() (the try/call lines are absent from this
# numbered dump) and map the high-level exceptions to exit messages.
3791 if __name__ == '__main__':
# DownloadError is already reported by the downloader when it is raised.
3794 except DownloadError:
3796 except SameFileError:
3797 sys.exit(u'ERROR: fixed output name but more than one file to download')
3798 except KeyboardInterrupt:
3799 sys.exit(u'\nERROR: Interrupted by user')
3801 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: