Final implementation of the caching.
authorshort <>
Sun, 18 Sep 2005 06:44:00 +0000 (06:44 +0000)
committershort <>
Sun, 18 Sep 2005 06:44:00 +0000 (06:44 +0000)
Web.pm

diff --git a/Web.pm b/Web.pm
index c368a4b..386f769 100644 (file)
--- a/Web.pm
+++ b/Web.pm
@@ -110,28 +110,27 @@ my $have_Geo_IP; BEGIN { $have_Geo_IP=eval { require Geo::IP; 1; }; }
 # I do not know why.
 use POSIX qw(strftime);
 use Tie::Handle;
-use Apache2::Const qw(HTTP_MOVED_TEMPORARILY OK);
+use Apache2::Const qw(HTTP_MOVED_TEMPORARILY OK HTTP_OK);
 use URI;
 use URI::QueryParam;
 use Cwd;
 require HTTP::Date;
+require Storable;
+require Digest::MD5;
+require Data::Compare;
+use Data::Dumper;
+require Encode;
+use Apache2::Filter;
 
 
 #our $W;
-               # $W->{"title"}
-               # $W->{"head"}
-               # $W->{"force_charset"}
-               # $W->{"heading_done"}
-               # $W->{"footer_passed"}
-               # %{$W->{"headers"}}
-               # %{$W->{"headers_lc"}} # maps lc($headers_key)=>$headers_key
-               # %{$W->{"args"}}
 
 sub cleanup($)
 {
 my($apache_request)=@_;
 
        $packages_used_hash{$W->{"__PACKAGE__"}}{"_done"}=1;
+       cache_finish();
        # Sanity protection.
        $W=undef();
        return OK;
@@ -225,17 +224,46 @@ my($class,%args)=@_;
        do { $W->{"r"}->args(""); delete $ENV{"QUERY_STRING"}; } if $W->{"r"}->method() eq "POST";
        # Do not: $W->{"r"}->args()
        # as it parses only QUERY_STRING (not POST data).
-       $W->{"args"}={ CGI->new($W->{"r"})->Vars() };
+       $W->{"args_orig_array"}=[ CGI->new($W->{"r"})->Vars() ];
+       $W->{"args"}={ @{$W->{"args_orig_array"}} };
        for my $name (keys(%{$W->{"args"}})) {
                my @vals=split /\x00/,$W->{"args"}{$name};
                next if @vals<=1;
                $W->{"args"}{$name}=[@vals];
                }
 
-       do { $W->{$_}=$W->{"r"}->headers_in()->{"Accept"}         if !defined $W->{$_}; } for ("accept");
-       do { $W->{$_}=$W->{"r"}->headers_in()->{"User-Agent"}||"" if !defined $W->{$_}; } for ("user_agent");
+       $W->{"headers_in"}=$W->{"r"}->headers_in();
+       Wrequire 'My::Hash::Merge';
+       $W->{"headers_in"}=My::Hash::Merge->new(
+                       $W->{"headers_in"},
+                       My::Hash::Sub->new({
+                               "_get_remote_host"=>sub { return $W->{"r"}->get_remote_host(); },
+                               }),
+                       );
+       $W->{"headers_in"}=My::Hash::Readonly->new($W->{"headers_in"});
+       
+       if ($W->{"r"}->method() eq "GET" || $W->{"r"}->method() eq "HEAD") {
+               for (\$W->{"http_safe"}) {
+                       # If you really need a non-safe GET/HEAD, extend the current ETag system instead:
+                       cluck "Explicitely NOT HTTP-Safe for method \"".$W->{"r"}->method()."\"?!?"
+                                       if defined($$_) && !$$_;
+                       $$_=1 if !defined $$_;
+                       }
+               }
+       else {
+               for (\$W->{"http_safe"}) {
+                       cluck "Undefined HTTP-Safe-ty for method \"".$W->{"r"}->method()."\"!"
+                                       if !defined($$_);
+                       $$_=0 if !defined $$_;
+                       }
+               }
+       if ($W->{"http_safe"}) {
+               Wrequire 'My::Hash::RecordKeys';
+               $W->{"headers_in_RecordKeys"}=My::Hash::RecordKeys->new($W->{"headers_in"});
+               $W->{"headers_in"}=$W->{"headers_in_RecordKeys"};
+               }
 
-       $W->{"browser"}=HTTP::BrowserDetect->new($W->{"user_agent"});
+       $W->{"browser"}=HTTP::BrowserDetect->new($W->{"headers_in"}{"User-Agent"});
 
        if (!defined $W->{"have_style"}) {
                $W->{"have_style"}=(!$W->{"browser"}->netscape() || ($W->{"browser"}->major() && $W->{"browser"}->major()>4) ? 1 : 0);
@@ -251,7 +279,7 @@ my($class,%args)=@_;
        return bless $W,$class;
 }
 
-# Although we have &tie-d *STDOUT we try to not to be dependent on it in My::Web itself.
+# Be aware other parts of code (non-My::Web) will NOT use this function!
 # Do not: Wprint $W->{"heading"},"undef"=>1;
 # as we would need to undef() it to turn it off and it would get defaulted in such case.
 # Do not: exists $W->{"heading"}
@@ -263,7 +291,9 @@ my($text,%args)=@_;
        cluck "undef Wprint" if !defined $text && !$args{"undef"};
        delete $args{"undef"};
        cluck join(" ","Invalid arguments:",keys(%args)) if keys(%args);
-       $W->{"r"}->puts($text) if defined $text;
+       return if !defined $text;
+       cluck "utf-8 untested" if Encode::is_utf8($text);
+       $W->{"r"}->puts($text);
 }
 
 sub escapeHTML($)
@@ -378,13 +408,11 @@ my($in,%args)=@_;
        return $uri->rel(unparsed_uri());
 }
 
-my %path_abs_disk_for_package; # $path_abs_disk_for_package{$W->{"__PACKAGE__"}}{$path_abs_disk}=1;
-
 sub path_abs_disk_register($)
 {
 my($path_abs_disk)=@_;
 
-       $path_abs_disk_for_package{$W->{"__PACKAGE__"}}{$path_abs_disk}=1;
+       $W->{"path_abs_disk_register"}{$path_abs_disk}=1;
 }
 
 # $args{"uri_as_in"}=1 to permit passing URI objects as: $in
@@ -517,17 +545,14 @@ sub footer (;$)
        exit 0;
 }
 
-sub header (%)
+# Existing entries are overwritten.
+sub header(%)
 {
 my(%pairs)=@_;
 
        while (my($key,$val)=each(%pairs)) {
                do { cluck "Headers already sent"; next; } if $W->{"heading_done"};
-               for ($W->{"headers_lc"}{lc $key} || ()) {
-                       delete $W->{"headers"}{$_};
-                       }
-               $W->{"headers_lc"}{lc $key}=$key;
-               $W->{"headers"}{$key}=$val;
+               $W->{"r"}->headers_out()->set($key,$val);
                }
 }
 
@@ -646,7 +671,7 @@ sub remote_ip ()
        # As 'Apache2::ForwardedFor' takes the first of $ENV{"HTTP_X_FORWARDED_FOR"}
        # while the contents is '127.0.0.1, 213.220.195.171' if client has its own proxy.
        # We must take the last item ourselves.
-       my $r=$W->{"r"}->headers_in()->{"X-Forwarded-For"} || $W->{"r"}->get_remote_host();
+       my $r=$W->{"headers_in"}{"X-Forwarded-For"} || $W->{"headers_in"}{"_get_remote_host"};
        $r=~s/^.*,\s*//;
        return $r;
 }
@@ -712,6 +737,13 @@ sub Negotiate_choose($$)
 {
 my($self,$variants)=@_;
 
+       # Limit the headers to these entries so that a proper 'Vary' header gets generated.
+       my %hash=(map(($_=>$W->{"headers_in"}{$_}),qw(
+                       Accept
+                       Accept-Charset
+                       Accept-Encoding
+                       Accept-Language
+                       )));
        my $best=HTTP::Negotiate::choose($variants,
                        # Do not: $W->{"r"}
                        # to prevent: Can't locate object method "scan" via package "Apache2::RequestRec" at HTTP/Negotiate.pm line 84.
@@ -719,7 +751,7 @@ my($self,$variants)=@_;
                        # to prevent: Can't locate object method "scan" via package "APR::Table" at HTTP/Negotiate.pm line 84.
                        # Do not: HTTP::Headers->new($W->{"r"}->headers_in());
                        # to prevent empty result or even: Odd number of elements in anonymous hash
-                       HTTP::Headers->new(%{$W->{"r"}->headers_in()}));
+                       HTTP::Headers->new(%hash));
        $best||=$variants->[0][0];      # $variants->[0]{"id"}; &HTTP::Negotiate::choose failed?
        return $best;
 }
@@ -821,7 +853,7 @@ my($class,$filename)=@_;
        return $F;
 }
 
-sub no_cache($)
+sub _no_cache($)
 {
 my($self)=@_;
 
@@ -839,53 +871,178 @@ my($self)=@_;
        header("Vary"=>"*");    # content may ba based on unpredictable sources
 }
 
-sub last_modified($)
+sub headers_in_filtered(@)
 {
-my($self)=@_;
+my(@keys)=@_;
+
+       return map(($_=>$W->{"headers_in"}{$_}),@keys);
+}
+
+our %uri_args_frozen_to_headers_in_keys;
+our %uri_args_headers_in_frozen_to_headers_out;
+
+sub uri_args_headers_in_frozen_get($)
+{
+my($headers_in_keys_arrayref)=@_;
+
+       my %uri_args_headers_in_hash=(
+               "uri_args_frozen"=>$W->{"uri_args_frozen"},
+               "headers_in"=>{ headers_in_filtered(@$headers_in_keys_arrayref) },
+               );
+       return do { local $Storable::canonical=1; Storable::freeze(\%uri_args_headers_in_hash); };
+}
+
+sub cache_output_filter($)
+{
+my($f)=@_;
+
+       while ($f->read(my $text,0x400)) {
+               cluck "utf-8 untested" if Encode::is_utf8($text);       # Possible here at all?
+               $f->print($text);
+               $W->{"digest-md5"}->add($text);
+               }
+       return OK;
+}
+
+sub cache_start()
+{
+       if (!$W->{"http_safe"}) {
+               __PACKAGE__->_no_cache();
+               return;
+               }
 
-       return if !$packages_used_hash{$W->{"__PACKAGE__"}}{"_done"};
-       our %path_abs_disk_registered;
-       if (!$path_abs_disk_registered{$W->{"__PACKAGE__"}}++) {
-               for my $package_orig (@{$packages_used_array{$W->{"__PACKAGE__"}}}) {
-                       local $_=$package_orig.".pm";
-                       s{::}{/}g;
-                       path_abs_disk "/$_","register"=>1;
+       {
+               # &Wrequire it here even if it will not be used later; to be stable!
+               Wrequire 'My::Hash::RestrictTo';
+               my %uri_args_hash=(
+                       "uri"=>"http://".$W->{"web_hostname"}."/".$W->{"r"}->uri(),
+                       "args"=>$W->{"args_orig_array"},
+                       );
+               $W->{"uri_args_frozen"}=do { local $Storable::canonical=1; Storable::freeze(\%uri_args_hash); };
+               last if !(my $headers_in_keys_arrayref=$uri_args_frozen_to_headers_in_keys{$W->{"uri_args_frozen"}});
+
+               # Protection to be sure we are stable:
+               $W->{"headers_in"}=My::Hash::RestrictTo->new($W->{"headers_in"},@$headers_in_keys_arrayref);
+
+               $W->{"uri_args_headers_in_frozen"}=uri_args_headers_in_frozen_get($headers_in_keys_arrayref);
+               last if !(my $headers_out_hashref=$uri_args_headers_in_frozen_to_headers_out{$W->{"uri_args_headers_in_frozen"}});
+               header(%$headers_out_hashref);
+               my $status;
+               {
+                       # &meets_conditions will always deny the attempt if !2xx status().
+                       # At least ap_read_request() sets: r->status=HTTP_REQUEST_TIME_OUT;     /* Until we get a request */
+                       my $status_old=$W->{"r"}->status();
+                       $W->{"r"}->status(HTTP_OK);
+                       # Update httpd's 'r->mtime' as the header "Last-Modified" is just not enough for ap_meets_conditions():
+                       # &update_mtime() argument is really in _secs_, not in _msecs_ as the docs claim.
+                       # Be aware '*1000000' would overflow Perl integer anyway.
+                       # &set_last_modified would also override the "Last-Modified" headers_out!
+                       # &mtime may exist but somehow does not work.
+                       $W->{"r"}->update_mtime(HTTP::Date::str2time($headers_out_hashref->{"Last-Modified"}));
+                       $status=$W->{"r"}->meets_conditions();
+                       $W->{"r"}->status($status_old);
                        }
+               last if OK==$status;
+               $W->{"r"}->status($status);
+               exit 0;
+               die "NOTREACHED";
+               }
+
+       $W->{"digest-md5"}=Digest::MD5->new();
+       $W->{"cache_active"}=1;
+       $W->{"r"}->add_output_filter(\&cache_output_filter);
+}
+
+sub cache_finish_last_modified()
+{
+       cluck "Not yet done now? W __PACKAGE__: ".$W->{"__PACKAGE__"}
+                       if !$packages_used_hash{$W->{"__PACKAGE__"}}{"_done"};
+       for my $package_orig (@{$packages_used_array{$W->{"__PACKAGE__"}}}) {
+               local $_=$package_orig.".pm";
+               s{::}{/}g;
+               path_abs_disk "/$_","register"=>1;
                }
        my $mtime_newest;
-       for my $path_abs_disk (keys(%{$path_abs_disk_for_package{$W->{"__PACKAGE__"}}})) {
-###print STDERR "CHECK:$path_abs_disk\n";
+       for my $path_abs_disk (keys(%{$W->{"path_abs_disk_register"}})) {
                my $mtime=(stat $path_abs_disk)[9];
                do { cluck "No mtime for: $path_abs_disk"; next; } if !$mtime;
                $mtime_newest=$mtime if !$mtime_newest || $mtime_newest<$mtime;
                }
        cluck "No mtime_newest found for the current W __PACKAGE__: ".$W->{"__PACKAGE__"}
                        if !$mtime_newest;
-       # "Vary" header is REQUIRED in this case:
-       header("Last-Modified"=>HTTP::Date::time2str($mtime_newest));
-       return 1;
+       return HTTP::Date::time2str($mtime_newest);
+}
+
+
+sub cache_finish()
+{
+       # Do not: return if !$W->{"uri_args_frozen"};
+       # as we may have just given a 304 and done 'exit 0;' without starting the caching.
+       return if !$W->{"cache_active"};
+
+       # Fill-in/check: %uri_args_frozen_to_headers_in_keys
+       my $headers_in_keys_stored_arrayref_ref=\$uri_args_frozen_to_headers_in_keys{$W->{"uri_args_frozen"}};
+       my @headers_in_keys=tied(%{$W->{"headers_in_RecordKeys"}})->accessed();
+       if (!$$headers_in_keys_stored_arrayref_ref
+                       || !Data::Compare::Compare(\@headers_in_keys,$$headers_in_keys_stored_arrayref_ref)) {
+               cluck "Non-matching generated 'headers_in_keys' per 'uri_args_frozen' key:\n"
+                                               .Dumper(\@headers_in_keys,$$headers_in_keys_stored_arrayref_ref)
+                               if $$headers_in_keys_stored_arrayref_ref;
+               # Build it, or overwrite it to prevent further duplicate warnings:
+               $$headers_in_keys_stored_arrayref_ref=\@headers_in_keys;
+               # Build it, or regenerate it as it is obsolete now:
+               $W->{"uri_args_headers_in_frozen"}=uri_args_headers_in_frozen_get(\@headers_in_keys);
+               }
+
+       # Prepare 'headers_out' for future reuse:
+       my %headers_out;
+       $headers_out{"Content-MD5"}=$W->{"digest-md5"}->b64digest();
+       # In fact we could also use the MD5 for the ETag, as whenever we know the ETag we also know the MD5.
+       # But this way we do not need to calculate the MD5 and can still provide an ETag.
+       # $W->{"r"}->set_etag() ?
+       $headers_out{"ETag"}='"'.Digest::MD5::md5_base64($W->{"uri_args_headers_in_frozen"}).'"';
+       # $W->{"r"}->set_content_length() ?
+       $headers_out{"Content-Length"}=$W->{"r"}->bytes_sent();
+       my %Vary=map(($_=>1),(@headers_in_keys));
+       for (keys(%Vary)) {
+               next if !/^_/;
+               $Vary{"*"}=1;
+               delete $Vary{$_};
+               }
+       %Vary=("*"=>1) if $Vary{"*"};
+       $headers_out{"Vary"}=join(", ",sort keys(%Vary));
+       # $W->{"r"}->set_last_modified() ?
+       $headers_out{"Last-Modified"}=cache_finish_last_modified();
+
+       # Fill-in/check: %uri_args_headers_in_frozen_to_headers_out
+       my $headers_out_stored_hashref_ref=\$uri_args_headers_in_frozen_to_headers_out{$W->{"uri_args_headers_in_frozen"}};
+       if (!$$headers_out_stored_hashref_ref
+                       || !Data::Compare::Compare(\%headers_out,$$headers_out_stored_hashref_ref)) {
+               cluck "Non-matching generated 'headers_out' per 'uri_args_headers_in_frozen' key:\n"
+                                               .Dumper(\%headers_out,$$headers_out_stored_hashref_ref)
+                               if $$headers_out_stored_hashref_ref;
+               # Build it, or overwrite it to prevent further duplicate warnings:
+               $$headers_out_stored_hashref_ref=\%headers_out;
+               }
+
+###print STDERR Dumper(\%uri_args_frozen_to_headers_in_keys,\%uri_args_headers_in_frozen_to_headers_out);
 }
 
 sub heading()
 {
 my($class)=@_;
 
+       if (!$W->{"header_only"}) {
+               header("Content-Style-Type"=>"text/css");
+               header("Content-Script-Type"=>"text/javascript");
+               # $W->{"r"}->content_languages() ?
+               do { header("Content-Language"=>$_) if $_; } for $W->{"language"};
+               }
+       # TODO: Support also: private
+       header("Cache-Control"=>"public");      # HTTP/1.1
+
        # $ENV{"CLIENT_CHARSET"} ignored (mod_czech support dropped!)
        my $client_charset=$W->{"force_charset"} || "us-ascii";
-       header("Content-Style-Type"=>"text/css");
-       header("Content-Script-Type"=>"text/javascript");
-       do { header("Content-Language"=>$_) if $_; } for $W->{"language"};
-       $class->last_modified() if !$W->{"no_cache"};
-       $class->no_cache() if $W->{"no_cache"};
-
-       while (my($key,$val)=each(%{$W->{"headers"}})) {
-               $W->{"r"}->headers_out()->{$key}=$val;
-               }
-       exit if $W->{"r"}->header_only();
-       return if $W->{"header_only"};
-       # We still can append headers before we put out some text.
-       # FIXME: It is not clean to still append them without overwriting.
-       return if $W->{"heading_done"}++;
 
        # Workaround bug
        #   https://bugzilla.mozilla.org/show_bug.cgi?id=120556
@@ -893,7 +1050,9 @@ my($class)=@_;
        #   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8b) Gecko/20050217
        my $mime;
        # http://validator.w3.org/ does not send ANY "Accept" headers!
-       $mime||="application/xhtml+xml" if !$W->{"accept"} && $W->{"user_agent"}=~m{^W3C_Validator/}i;
+       $mime||="application/xhtml+xml" if 1
+                       && !$W->{"headers_in"}{"Accept"}
+                       && ($W->{"headers_in"}{"User-Agent"}||"")=~m{^W3C_Validator/}i;
        $mime||=$class->Negotiate_choose([
                        # Put the fallback variant as the first one.
                        # Rate both variants the same to prefer "text/html" for undecided clients.
@@ -918,7 +1077,16 @@ my($class)=@_;
                        # application/xml ?
                        # text/xml ?
                        ]);
+       # mod_perl doc: If you set this header via the headers_out table directly, it
+       #               will be ignored by Apache. So do not do that.
        $W->{"r"}->content_type("$mime; charset=$client_charset");
+
+       cache_start();
+       return if $W->{"header_only"};
+       # We still can append headers before we put out some text.
+       # FIXME: It is not clean to still append them without overwriting.
+       return if $W->{"heading_done"}++;
+
        Wprint '<?xml version="1.0" encoding="'.$client_charset.'"?>'."\n" if $mime=~m{^application/\w+[+]xml$};
        return if $W->{"xml_header_only"};
        Wprint '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'."\n";