#!/usr/local/bin/perl
#
# squidcache2archive: convert files from Squid's cache format to
# a format more suitable for permanent archival
#
# Created 18 Jan 1999 by
# Gerald Oskoboiny <gerald@impressive.net>
# http://impressive.net/people/gerald/
#
# See also:
#     http://impressive.net/people/gerald/1999/01/http-archive/
#
# $Id: squidcache2archive,v 1.7 2000/08/29 03:27:20 gerald Exp $
#
# Usage:
#
# If invoked with "-catchup" as its first argument, it processes the
# entire $store_log and terminates.
#
# If invoked under a name matching "unlinkd", it assumes it's being
# run as a replacement for squid's unlinkd, and loops continuously,
# archiving each cache filename given to it on stdin.
#
# Otherwise, it assumes it should be a daemon that does a tail -f
# on $store_log, alternately archiving everything and sleeping
# for a while to reduce the load on the system.
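#
# Example invocations (paths illustrative):
#
#     squidcache2archive -catchup       # drain all of store.log, then exit
#     squidcache2archive &              # daemon mode: tail -f store.log
#     ln -s squidcache2archive unlinkd  # run in squid's place as unlinkd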

$me = $0;
$me =~ s,.*/,,;		# just want the basename

$| = 1;			# unbuffer STDOUT

$cache_basedir		= "/var/spool/squid";
$archive_basedir	= "/archives/http";
$log_base		= "$archive_basedir/log";
$squid_logs		= "$archive_basedir/squid-logs";
$store_log		= "$squid_logs/store.log";
$pidfile		= "$squid_logs/squidcache2archive.pid";

if ( ( $ARGV[0] eq "-catchup" ) || ( $ARGV[0] eq "-ketchup" ) ) {
    # process the whole shebang at once instead of tail -f'ing it
    $mode = "catchup";
    open( STORELOG, "< $store_log" ) ||
        die "unable to read store log, $store_log! $!";
    *FHANDLE = *STORELOG;
}
elsif ( $me =~ /unlinkd/ ) {
    $mode = "unlinkd";
    *FHANDLE = *STDIN;
}
else {
    $mode = "daemon";

    open( PIDFILE, "> $pidfile" ) || die "couldn't write to pid file! $!";
    print PIDFILE $$, "\n";
    close( PIDFILE ) || die "couldn't close pidfile! $!";

    open( STORELOG, "tail -f $store_log |" ) || die "unable to get tail! $!";
    *FHANDLE = *STORELOG;
}

$log = $log_base . ".$mode";
open( LOG, ">> $log" ) || die "couldn't append to log! $!";

select( LOG ); $| = 1;	# unbuffer LOG so entries appear immediately

while (<FHANDLE>) {

    chomp;

    if ( $mode eq "unlinkd" ) {
        $timestamp = time;
	$cache_file = $_;
    }
    else {

	# see http://irb.cs.uni-magdeburg.de/~elkner/proxy/Squid/storelog2.shtml
	# for docs on the format of store.log
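	#
	# a typical entry (illustrative values; shown wrapped here, but
	# it's all one line):
	#
	# 916634059.111 SWAPOUT 00052A9B 200 916634058 916630000 916720000
	#     text/html 4176/4176 GET http://impressive.net/people/gerald/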
	($timestamp,$tag,$swapnum,$status,$httpdate,$lastmod,
	    $expires,$ct,$length_and_size,$method,$url) = split;

	if ( $tag ne "SWAPOUT" ) {
	    &log( "skipping entry dated $timestamp because its tag is $tag");
	    next;
	}

	$timestamp =~ s/\..*//;	# don't care about milliseconds; strip 'em

	$temp = $swapnum;
	$temp =~ s,^..(..)(..)..,$1/$2,;
	$cache_file = $cache_basedir . "/" . $temp . "/" . $swapnum;
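	# e.g. (illustrative) swap file number 00052A9B maps to
	# /var/spool/squid/05/2A/00052A9B under squid's default
	# two-level cache_dir layout, which the regex above assumes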
	
    }

    open( CACHEFILE, "< $cache_file" ) || do {
        &log( "couldn't read from cache file $cache_file! $!, skipping it.");
	next;
    };
    $line = "";
    while ( $line !~ /http/ ) {	# sometimes this stuff spans 2 or more lines
        chomp( $line .= <CACHEFILE> );
    }
    $url = $line;
    $url =~ s/.*\x00\x00\x00http/http/;
    $url =~ s/\x00.*//;

    # this is something like what the http-header separation code
    # could look like (untested sketch):
    #
    # while ( $line = <CACHEFILE> ) {	# swallow up the header
    #     last if $line =~ /^\r?$/;	# blank line ends the header
    # }
    #
    # need better docs on squid's cache file format (or time to spend
    # reading its code) before spending any more time on this though...

    close( CACHEFILE ) || do {
        &log( "couldn't close cachefile $cache_file! $!" );
    };

    # encode the URL so it fits into the filesystem nicely yet
    # still retains most of its meaning
    $encoded_url = $url;
    $encoded_url =~ s,^http://,,;
    $encoded_url =~
        s/[^A-Za-z0-9:\.@#^&_+=\?\/-]/uc sprintf("%%%02x",ord($&))/eg;
    $encoded_url =~ s#/#,#g;
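    # e.g. (illustrative) http://impressive.net/people/gerald/?x=a%20b
    # encodes to impressive.net,people,gerald,?x=a%2520b -- a literal ","
    # in a URL gets %-encoded first, so "," always means "/"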

    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
        localtime($timestamp);

    # archive filename looks like this:
    #     /archives/http/1999/01/18/03:34:19/impressive.net,people,gerald,

    if ( $mode eq "unlinkd" ) {
	# change the filename slightly if we're running in unlinkd mode --
	# add an "aa:" before the HH:MM:SS, meaning "archived at" as
	# opposed to the default meaning which is "browsed at". These files
	# can be re-munged to the browsed-at timestamp later if/when I
	# figure out how to get that info out of the cached file; in the
	# meantime, no big whoop
	$archive_dir = $archive_basedir .
	    sprintf( "/%4d/%02d/%02d/aa:%02d:%02d:%02d",
	    $year+1900, $mon+1, $mday, $hour, $min, $sec );
    }
    else {
	$archive_dir = $archive_basedir .
	    sprintf( "/%4d/%02d/%02d/%02d:%02d:%02d",
	    $year+1900, $mon+1, $mday, $hour, $min, $sec );
    }

    system "mkdir -p $archive_dir";
    # @@ check return code

    $archive_filename = $archive_dir . "/" . $encoded_url;

    system "cp $cache_file \"$archive_filename\"";
    # @@ check return code

    &log( "archived $cache_file as $archive_filename" );

    if ( $mode eq "unlinkd" ) {
        &log( "unlinked $cache_file" );
	unlink( $cache_file ) || do {
	    &log( "unlink of $cache_file failed! $!" );
	    next;
	};
    }

    if ( $mode eq "daemon" ) {
        sleep( 2 );	# so we don't bog down the system actively catching
    }			# up after our 30m nap

    $current_time = time;
    if ( ( $current_time - $timestamp < 60 ) && ( $mode eq "daemon" ) ) {
	# if we're within 60s from the bottom of store.log, sleep for 5m
	# to avoid bogging down the system while active surfing is in
	# progress -- the unlinkd replacement will catch anything before
	# Squid can remove it anyway
	&log( "all caught up; sleeping for 5 mins." );
	sleep( 300 );
	&log( "aaah, that was refreshing! back to work!" );
    }

}

close( FHANDLE ) || die "couldn't close input filehandle! $!";

# @@ this will never terminate naturally; should add a signal handler that
# dies nicely? (i.e., archives any pending stuff, then exits.)
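#
# a minimal sketch of such a handler (it just exits cleanly; archiving
# pending entries first is left for later). it would need to be
# installed before the main loop, of course:
#
# sub bail {
#     &log( "caught SIG$_[0], exiting" );
#     unlink( $pidfile ) if $mode eq "daemon";
#     exit;
# }
# $SIG{INT} = $SIG{TERM} = \&bail;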

exit;

sub log {	# always called as &log() to stay clear of Perl's builtin log()

    my $message = shift;
    my $time = time;

    print LOG "$time $me\[$$\]: $message\n";

}

