# Blosxom Plugin: lastmodified2
# Author(s): Frank Hecker <hecker@hecker.org>
# (based on work by Bob Schumaker <cobblers@pobox.com>)
# Version: 0.11
# Documentation: See the bottom of this file or type: perldoc lastmodified2

package lastmodified2;

use strict;

use HTTP::Date;
use Data::Dumper;
use POSIX qw! strftime !;

# Prefer the Digest::MD5 module; fall back to the legacy MD5 module if
# Digest::MD5 cannot be loaded. Neither flag is set if both are missing.
#
# NOTE: the two flags are deliberately declared without initializers.
# They are assigned inside BEGIN (i.e., at compile time); an initializer
# would execute later, at run time, and wipe out the values set here.

my $use_digest;
my $use_just_md5;

BEGIN {
    my $have_digest_md5 = eval { require Digest::MD5; 1 };

    if ($have_digest_md5) {
        Digest::MD5->import();
        $use_digest = 1;
    }
    else {
        my $have_legacy_md5 = eval { require MD5; 1 };

        if ($have_legacy_md5) {
            MD5->import();
            $use_just_md5 = 1;
        }
    }
}

# --- Package variables -----

# Time at which this file was loaded; used for the $now_* dates, the
# Expires header, the fallback last-modified value under strong
# validation, and the name of the temporary validator cache file.
my $current_time = time();              # Use consistent value of current time.

# Last-modified time for the response: the newest entry time found by
# filter, or (under strong validation) the value chosen in last.
my $last_modified_time = 0;

# Entity tag for the response: 'W/"<time>"' when set by filter for weak
# validation, '"<md5 digest>"' when set by last for strong validation.
my $etag = "";

# MD5 digest of $blosxom::output, computed in last under strong validation.
my $md5_digest = "";

# Cached validators, keyed by the tag returned from cache_tag; each value
# is a hash with 'last-modified' and 'md5' entries. Loaded from the cache
# file in start and written back in last.
my %validator;

# --- Output variables -----

# Date/time of the most recently modified entry (set in filter).

our $latest_rfc822 = '';
our $latest_iso8601 = '';

# Date/time of the most recently modified "other" file (set in filter,
# only when $use_others is enabled).

our $others_rfc822 = '';
our $others_iso8601 = '';

# Date/time at which the plugin was loaded (set in start).

our $now_rfc822 = '';
our $now_iso8601 = '';

# Date/time of the story currently being processed (set in story).

our $story_rfc822 = '';
our $story_iso8601 = '';

# --- Configurable variables -----

my $generate_etag = 1;                  # generate ETag header?

my $generate_mod = 1;                   # generate Last-modified header?

my $strong = 0;                         # do strong validation?

my $val_cache = "validator.cache";      # where to cache last-modified values
                                        # and MD5 digests (in state directory)

my $generate_expires = 1;               # generate Expires header?

my $generate_cache = 1;                 # generate Cache-control header?

my $generate_length = 1;                # generate Content-length header?

my $use_others = 0;                     # consult %others for weak validation
                                        # (DEPRECATED)

my $export_dates = 1;                   # set $latest_rfc822, etc., for
                                        # compatibility with lastmodified

my $debug = 0;                          # set > 0 for debug output
                                        # (set > 1 for more verbose output)

# Freshness times (in seconds) for each flavour, and the default value to
# use if the flavour is not in the list. To disallow caching pages of a
# particular flavour, set the freshness time to zero for that flavour; the
# plugin then sends an Expires date in the past and "Cache-control: no-cache".

my %freshness_time = (
		      'html' => 0,
		      'atom' => 3000,
		      'rss' => 3000
		      );
my $default_freshness_time = 3000;

# --------------------------------


# Do any initial processing, and decide whether to activate the plugin.
#
# Returns 0 (plugin not activated) when doing static page generation and
# 1 otherwise. As side effects this subroutine
#   - forces $strong to 0 if no MD5 module could be loaded,
#   - limits every freshness time, including the default one, to the
#     range 0 .. one year,
#   - loads %validator from the cache file when Last-modified is used as
#     a strong validator, and
#   - sets $now_rfc822 and $now_iso8601 when dates are being exported.

sub start {
    warn "lastmodified2: start\n" if $debug > 1;

    # Don't activate this plugin if we are doing static page generation.

    return 0 if $blosxom::static_or_dynamic eq 'static';

    # If we can't do MD5 then we don't do strong validation.

    if ($strong && !($use_digest || $use_just_md5)) {
        $strong = 0;

        warn "lastmodified2: MD5 not available, forcing weak validation\n"
            if $debug > 0;
    }

    # Limit freshness times to a maximum of one year; they must also be
    # non-negative. The default value is limited as well, since it is used
    # for every flavour that has no entry of its own in %freshness_time.

    my $max_freshness_time = 365*24*3600;

    my @freshness_refs = (
        \$default_freshness_time,
        map { \$freshness_time{$_} } sort keys %freshness_time,
    );

    for my $seconds_ref (@freshness_refs) {
        $$seconds_ref = $max_freshness_time
            if $$seconds_ref > $max_freshness_time;
        $$seconds_ref = 0
            if $$seconds_ref < 0;
    }

    if ($debug > 1) {
        warn "lastmodified2: \$generate_etag = $generate_etag\n";
        warn "lastmodified2: \$generate_mod = $generate_mod\n";
        warn "lastmodified2: \$strong = $strong\n";
        warn "lastmodified2: \$generate_cache = $generate_cache\n";
        warn "lastmodified2: \$generate_expires = $generate_expires\n";
        warn "lastmodified2: \$generate_length = $generate_length\n";
    }

    # If we are using Last-modified as a strong validator then read
    # in the cached last-modified values and MD5 digests. A missing or
    # unreadable cache file is not an error; we simply start out empty.

    if ($generate_mod && $strong) {
        my $cache_file = "$blosxom::plugin_state_dir/$val_cache";

        if (open my $cache_fh, '<', $cache_file) {
            warn "lastmodified2: loading cached validators\n" if $debug > 0;

            my $index = join '', <$cache_fh>;
            close $cache_fh;

            # NOTE(review): the cache is Data::Dumper output and is loaded
            # with a string eval, so anyone able to write to the plugin
            # state directory can run code here. Keep that directory
            # writable only by the web server user.

            my $VAR1;
            if ($index =~ m!\$VAR1 = \{!) {
                eval $index;
                %validator = %$VAR1 if !$@ && ref $VAR1 eq 'HASH';
            }
        }
    }

    # Convert current time to RFC 822 and ISO 8601 formats for others' use.

    if ($export_dates && $current_time) {
        $now_rfc822 = HTTP::Date::time2str($current_time);
        $now_iso8601 = iso8601($current_time);
    }

    return 1;
}


# Blosxom filter hook: determine when the entries selected for this page
# were most recently modified.
#
# $files and $others are hash references whose values are treated as
# modification times. Sets $last_modified_time, the exported $latest_*
# (and, with $use_others, $others_*) date strings, and the weak entity
# tag when weak validation is in effect. Always returns 1.

sub filter {
    my ($pkg, $files, $others) = @_;

    warn "lastmodified2: filter\n" if $debug > 1;

    # Nothing to do here unless we are doing weak validation and/or
    # exporting the *_rfc822 and *_iso8601 variables for others to use.

    my $weak_validation = ($generate_etag || $generate_mod) && !$strong;

    return 1 if !$export_dates && !$weak_validation;

    # Scan the entries to be displayed for the newest modification time.

    $last_modified_time = 0;
    for my $mtime (values %$files) {
        $last_modified_time = $mtime if $mtime > $last_modified_time;
    }

    warn "lastmodified2: \$last_modified_time = " .
        $last_modified_time . " (entries)\n" if $debug > 0;

    # Export that time in RFC 822 and ISO 8601 formats.

    if ($export_dates && $last_modified_time) {
        $latest_rfc822 = HTTP::Date::time2str($last_modified_time);
        $latest_iso8601 = iso8601($last_modified_time);
    }

    # Optionally take the other files into account too (DEPRECATED).

    if ($use_others) {
        my $newest_other = 0;
        for my $mtime (values %$others) {
            $newest_other = $mtime if $mtime > $newest_other;
        }

        if ($export_dates && $newest_other) {
            $others_rfc822 = HTTP::Date::time2str($newest_other);
            $others_iso8601 = iso8601($newest_other);
        }

        warn "lastmodified2: \$others_last_modified_time = " .
            $newest_other . " (others)\n" if $debug > 0;

        $last_modified_time = $newest_other
            if $newest_other > $last_modified_time;
    }

    # For weak validation the entity tag is simply the latest modification
    # time, marked as weak by the 'W/' prefix.

    if ($generate_etag && !$strong) {
        $etag = 'W/"' . $last_modified_time . '"';

        warn "lastmodified2: \$etag = $etag\n" if $debug > 0;
    }

    return 1;
}


# Blosxom skip hook: on a conditional GET that can be answered with a 304
# under weak validation, generate the configured headers right away and
# tell Blosxom to skip story processing.
#
# Returns a true value if a 304 response is being sent (story processing
# is skipped) and a false value otherwise.

sub skip {
    warn "lastmodified2: skip\n" if $debug > 1;

    # Strong validators are derived from the complete output, so with
    # strong validation (or with no validators at all) we can never skip
    # story processing.

    my $validating = $generate_etag || $generate_mod;

    return 0 if $strong || !$validating;

    # With weak validation we already know enough to decide on a 304.

    my $send_304 = check_for_304();

    # The last subroutine is not executed when story processing is
    # skipped, so the 304 status and headers must be produced here.

    if ($send_304) {
        add_headers($send_304);
    }

    return $send_304;
}


# Blosxom story hook: export the date/time of the current story in
# RFC 822 and ISO 8601 formats via $story_rfc822 and $story_iso8601.
#
# The story's time is looked up in %blosxom::files under the key built
# from the data directory, $path, $filename and the file extension; both
# variables are set to the empty string if no time is found. The story,
# title and body references are not used. Always returns 1.

sub story {
    my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

    warn "lastmodified2: story (\$path = $path, \$filename = $filename)\n"
        if $debug > 1;

    return 1 unless $export_dates;

    $path ||= "";

    my $entry_file =
        "$blosxom::datadir$path/$filename.$blosxom::file_extension";
    my $timestamp = $blosxom::files{$entry_file};

    warn "lastmodified2: \$timestamp = $timestamp\n" if $debug > 0;

    if ($timestamp) {
        $story_rfc822 = HTTP::Date::time2str($timestamp);
        $story_iso8601 = iso8601($timestamp);
    }
    else {
        $story_rfc822 = '';
        $story_iso8601 = '';
    }

    return 1;
}


# Do conditional GET checks if we couldn't do them before (i.e., we are
# doing strong validation and couldn't skip story processing) and output
# any configured headers plus a 304 status if appropriate.
#
# Under strong validation the entity tag is the MD5 digest of the complete
# output, and the last-modified time is taken from the validator cache as
# long as the cached digest for this URI still matches. The cache file is
# rewritten whenever the digest for this URI has changed. Always returns 1.

sub last {
    warn "lastmodified2: last\n" if $debug > 1;

    # If some other plugin has set the HTTP status to a non-OK value then we
    # don't attempt to do anything here, since it would probably be wrong.

    return 1 if $blosxom::header->{'Status'} &&
        $blosxom::header->{'Status'} !~ m!^200 !;

    # If we are using ETag and/or Last-modified as a strong validator then
    # we generate an entity tag from the MD5 message digest of the complete
    # output. (We use the base-64 representation if possible because it is
    # more compact than hex and hence saves a few bytes of bandwidth.)
    # The legacy MD5 module only offers a hex digest; its class method is
    # named hexhash.

    if (($generate_etag || $generate_mod) && $strong) {
        $md5_digest =
            $use_digest ? Digest::MD5::md5_base64($blosxom::output)
                        : MD5->hexhash($blosxom::output);
        $etag = '"' . $md5_digest . '"';

        warn "lastmodified2: \$etag = $etag\n" if $debug > 0;
    }

    # If we are using Last-modified as a strong validator then we look up
    # the cached MD5 digest for this URI, compare it to the current digest,
    # and use the cached last-modified value if they match. Otherwise we set
    # the last-modified value to just prior to the current time.

    my $cache_tag = cache_tag();
    my $update_cache = 0;

    if ($generate_mod && $strong) {
        if ($validator{$cache_tag} &&
            $md5_digest eq $validator{$cache_tag}{'md5'}) {
            $last_modified_time = $validator{$cache_tag}{'last-modified'};
        } else {
            $last_modified_time = $current_time - 5;
            $validator{$cache_tag}{'last-modified'} = $last_modified_time;
            $validator{$cache_tag}{'md5'} = $md5_digest;
            $update_cache = 1;
        }

        warn "lastmodified2: \$last_modified_time = $last_modified_time\n"
            if $debug > 0;
    }

    # Do conditional GET checks and output configured headers plus status.

    my $send_304 = check_for_304();
    add_headers($send_304);

    # Update the validator cache if we need to. To minimize race conditions
    # we write the cache as a temporary file and then rename it. The rename
    # happens only if the temporary file was written and closed without
    # error, so a failed write can never replace a good cache.

    if ($update_cache) {
        warn "lastmodified2: updating validator cache\n" if $debug > 0;

        my $tmp_cache = "$val_cache-$$-$current_time";
        my $tmp_path = "$blosxom::plugin_state_dir/$tmp_cache";
        my $cache_path = "$blosxom::plugin_state_dir/$val_cache";

        if (open my $cache_fh, '>', $tmp_path) {
            my $written = print {$cache_fh} Dumper(\%validator);

            # Always close the handle; buffered write errors show up here.
            $written = close($cache_fh) && $written;

            if ($written) {
                warn "lastmodified2: renaming $tmp_cache to $val_cache\n"
                    if $debug > 1;

                unless (rename($tmp_path, $cache_path)) {
                    warn "couldn't rename $tmp_path: $!\n";
                    unlink $tmp_path;   # don't leave the temporary file
                }
            } else {
                warn "couldn't write $tmp_path: $!\n";
                unlink $tmp_path;       # don't leave a partial file behind
            }
        } else {
            warn "couldn't > $tmp_path: $!\n";
        }
    }

    1;
}


# Check If-none-match and/or If-modified-since headers and return true if
# we can send a 304 (not modified) response instead of a normal response.
#
# Reads the HTTP_IF_NONE_MATCH and HTTP_IF_MODIFIED_SINCE environment
# variables and compares them with $etag and $last_modified_time, which
# must already have been set for this request. Returns 1 or 0.

sub check_for_304 {
    my $etag_send_304 = 0;
    my $mod_send_304 = 0;
    my $etag_request = 0;
    my $mod_request = 0;
    my $send_304 = 0;

    warn "lastmodified2: check_for_304\n" if $debug > 1;

    # For a conditional GET using the If-none-match header, compare the
    # ETag value(s) in the header with the ETag value generated for the page,
    # set $etag_send_304 true if we don't need to send a full response,
    # and note that an etag value was included in the request.

    if ($ENV{'HTTP_IF_NONE_MATCH'}) {
        $etag_request = 1;
        if ($generate_etag) {
            my @inm_etags = split '\s*,\s*', $ENV{'HTTP_IF_NONE_MATCH'};

            if ($debug > 0) {
                for my $inm_etag (@inm_etags) {
                    warn "lastmodified2: \$inm_etag = |" . $inm_etag . "|\n";
                }
            }

            for my $inm_etag (@inm_etags) {
                if ($etag eq $inm_etag) {
                    $etag_send_304 = 1;
                    last;
                }
            }
        }
    }

    # For a conditional GET using the If-modified-since header, compare the
    # time in the header with the time any entry on the page was last modified,
    # set $mod_send_304 true if we don't need to send a full response, and
    # also note that a last-modified value was included in the request.

    if ($ENV{'HTTP_IF_MODIFIED_SINCE'}) {
        $mod_request = 1;
        if ($generate_mod) {
            my $ims_time =
                HTTP::Date::str2time($ENV{'HTTP_IF_MODIFIED_SINCE'});

            warn "lastmodified2: \$ims_time = " .
                (defined $ims_time ? $ims_time : '(unparseable)') . "\n"
                if $debug > 0;

            # A date we cannot parse must never produce a 304; without
            # this check the undefined value would compare as time zero.

            $mod_send_304 = 1
                if defined $ims_time && $last_modified_time <= $ims_time;
        }
    }

    # If the request includes both If-none-match and If-modified-since then
    # we don't send a 304 response unless both tests agree it should be sent,
    # per section 13.3.4 of the HTTP 1.1 specification.

    if ($etag_request && $mod_request) {
        $send_304 = $etag_send_304 && $mod_send_304;
    } else {
        $send_304 = $etag_send_304 || $mod_send_304;
    }

    warn "lastmodified2: \$send_304 = " . $send_304 .
            " \$etag_send_304 = " . $etag_send_304 .
            " \$mod_send_304 = " . $mod_send_304 . "\n"
        if $debug > 0;

    return $send_304;
}


# Set the HTTP status and add the configured validation and caching
# headers to $blosxom::header.
#
# $send_304 is true when a 304 (not modified) response is being sent; in
# that case the status is set, $blosxom::output is emptied, and the
# Last-modified and Content-length headers are left out.

sub add_headers {
    my ($send_304) = @_;

    warn "lastmodified2: add_headers (\$send_304 = $send_304)\n"
        if $debug > 1;

    # A 304 response carries a different status and no body.

    if ($send_304) {
        $blosxom::header->{'Status'} = "304 Not Modified";
        $blosxom::output = "";

        warn "lastmodified2: Status: " .
            $blosxom::header->{'Status'} . "\n" if $debug > 0;
    }

    # For the rules on what headers to generate for a 304 response, see
    # section 10.3.5 of the HTTP 1.1 protocol specification.

    # Last-modified accompanies full responses only.

    unless ($send_304 || !$generate_mod) {
        $blosxom::header->{'Last-modified'} =
            HTTP::Date::time2str($last_modified_time);

        warn "lastmodified2: Last-modified: " .
            $blosxom::header->{'Last-modified'} . "\n" if $debug > 0;
    }

    # ETag is sent on a 304 response just as it is on a 200 response.

    if ($generate_etag) {
        $blosxom::header->{'ETag'} = $etag;

        warn "lastmodified2: ETag: " .
            $blosxom::header->{'ETag'} . "\n" if $debug > 0;
    }

    # Look up how long pages of this flavour may be cached, falling back
    # to the default for flavours that are not listed.

    warn "lastmodified2: \$flavour = '" . $blosxom::flavour
        . "', \$freshness_time = '" . $freshness_time{$blosxom::flavour}
        . "'\n"
        if $debug > 1;

    my $time_to_cache = $freshness_time{$blosxom::flavour};
    $time_to_cache = $default_freshness_time
        unless defined $time_to_cache;

    # Expires is sent for a 304 too since its value changes with each
    # request. A zero freshness time yields a date in the past.

    if ($generate_expires) {
        my $expires_at = $time_to_cache ? $current_time + $time_to_cache
                                        : $current_time - 60;

        $blosxom::header->{'Expires'} = HTTP::Date::time2str($expires_at);

        warn "lastmodified2: Expires: " .
            $blosxom::header->{'Expires'} . "\n" if $debug > 0;
    }

    # Cache-control is sent for a 304 response for consistency with Expires.

    if ($generate_cache) {
        if ($time_to_cache) {
            $blosxom::header->{'Cache-control'} = "max-age=" . $time_to_cache;
        }
        else {
            $blosxom::header->{'Cache-control'} = "no-cache";
        }

        warn "lastmodified2: Cache-control: " .
            $blosxom::header->{'Cache-control'} . "\n" if $debug > 0;
    }

    # Content-length accompanies full responses only.

    unless ($send_304 || !$generate_length) {
        $blosxom::header->{'Content-length'} = length($blosxom::output);

        warn "lastmodified2: Content-length: " .
            $blosxom::header->{'Content-length'} . "\n" if $debug > 0;
    }
}


# Build the key under which the last-modified value and MD5 digest for
# the requested URI are stored in the validator cache.
#
# The key is the REQUEST_URI environment variable (or the empty string if
# it is not set); unless it already contains "/index.", a trailing slash
# is added if missing, followed by "index." and the current flavour.

sub cache_tag {
    my $uri = $ENV{REQUEST_URI} || "";

    # A URI that already names an index page is unique as it stands.

    return $uri if $uri =~ m!/index\.!;

    # Otherwise append "/index.flavour", taking care not to double the slash.

    $uri .= '/' if $uri !~ m!/$!;

    return $uri . "index.$blosxom::flavour";
}


# Convert time to ISO 8601 format (including time zone offset).
# (Format is YYYY-MM-DDThh:mm:ssTZD per http://www.w3.org/TR/NOTE-datetime)
#
# $timestamp is a time in seconds since the epoch. The result is given in
# local time, and the time zone offset is the one in effect at $timestamp
# (not the one in effect now), so that dates on the other side of a
# daylight saving time change get the correct offset.

sub iso8601 {
    my ($timestamp) = @_;

    my @local_time = localtime($timestamp);

    # strftime gives the offset as "+hhmm"; the format wants "+hh:mm".

    my $tz_offset = strftime("%z", @local_time);
    $tz_offset =~ s/^([+-]\d\d)(\d\d)$/$1:$2/;

    return strftime("%Y-%m-%dT%H:%M:%S", @local_time) . $tz_offset;
}


1;

__END__

=head1 NAME

Blosxom Plug-in: lastmodified2

=head1 SYNOPSIS

Enables caching and validation of dynamically-generated Blosxom pages
by generating C<ETag>, C<Last-modified>, C<Cache-control>, and/or
C<Expires> HTTP headers in the response and responding appropriately
to an C<If-none-match> and/or C<If-modified-since> header in the
request. Also generates a C<Content-length> header to support HTTP 1.0
persistent connections.

=head1 VERSION

0.11

=head1 AUTHOR

Frank Hecker <hecker@hecker.org>, http://www.hecker.org/ (based on
work by Bob Schumaker, <cobblers@pobox.com>, http://www.cobblers.net/blog/)

=head1 DESCRIPTION

This plugin enables caching and validation of dynamically-generated
Blosxom pages by web browsers, web proxies, feed aggregators, and
other clients by generating various cache-related HTTP headers in the
response and supporting conditional GET requests, as described
below. This can reduce excess network traffic and server load caused
by requests for RSS or Atom feeds or for web pages for popular entries
or categories.

=head1 INSTALLATION AND CONFIGURATION

Copy this plugin into your Blosxom plugin directory. You should not
normally need to rename the plugin; however see the discussion below.

Configurable variables specify how the plugin handles validation
(C<$generate_etag>, C<$generate_mod>, and C<$strong>), caching
(C<$generate_cache>, C<$generate_expires>, and C<%freshness_time>) and
whether or not to generate any other recommended headers
(C<$generate_length>). The plugin supports the variable C<$use_others>
as used in the lastmodified plugin; however use of this is deprecated
(use strong validation instead). The variable C<$export_dates>
specifies whether to export date/time variables C<$latest_rfc822>,
etc., for compatibility with the lastmodified plugin.

You can set the variable C<$debug> to 1 or greater to produce
additional information useful in debugging the operation of the
plugin; the debug output is sent to your web server's error log.

This plugin supplies C<start>, C<filter>, C<skip>, C<story>, and
C<last> subroutines. It needs to run after any other plugin whose
C<filter> subroutine changes
the list of entries included in the response; otherwise the
C<Last-modified> date may be computed incorrectly. It needs to run
after any other plugin whose C<skip> subroutine does redirection
(e.g., the canonicaluri plugin) or otherwise conditionally sets the
HTTP status to any value other than 200. Finally, this plugin needs to
run after any other plugin whose C<last> subroutine changes the output
for the page; otherwise the C<Content-length> value (and the C<ETag>
and C<Last-modified> values, if you are using strong validation) may
be computed incorrectly. If you are encountering problems in any of
these regards then you can force the plugin to run after other plugins
by renaming it to, e.g., 99lastmodified2.

=head1 SEE ALSO

Blosxom Home/Docs/Licensing: http://www.blosxom.com/

Blosxom Plugin Docs: http://www.blosxom.com/documentation/users/plugins.html

lastmodified plugin: http://www.cobblers.net/blog/dev/blosxom/

more on the lastmodified2 plugin: http://www.hecker.org/blosxom/lastmodified2

=head1 AUTHOR

Frank Hecker <hecker@hecker.org> http://www.hecker.org/

Based on the original lastmodified plugin by Bob Schumaker
<cobblers@pobox.com> http://www.cobblers.net/blog

=head1 LICENSE

This source code is submitted to the public domain.  Feel free to use
and modify it.  If you like, a comment in your modified source
attributing credit to myself, Bob Schumaker, and any other
contributors for our work would be appreciated.

THIS SOFTWARE IS PROVIDED AS IS AND WITHOUT ANY WARRANTY OF ANY KIND.
USE AT YOUR OWN RISK!
