#!/usr/local/bin/perl
#
# squidcache2archive: convert files from Squid's cache format to
# a format more suitable for permanent archival
#
# Created 18 Jan 1999 by
# Gerald Oskoboiny <gerald@impressive.net>
# http://impressive.net/people/gerald/
#
# See also:
#     http://impressive.net/people/gerald/1999/01/http-archive/
#
# $Id: squidcache2archive,v 1.7 2000/08/29 03:27:20 gerald Exp $
#
# Usage:
#
# If invoked with "-catchup" as its first argument, it processes the
# entire $store_log and terminates.
#
# If invoked under a name matching "unlinkd", it assumes it's being
# run as a replacement for squid's unlinkd, and loops continuously,
# archiving each cache filename given to it on stdin.
#
# Otherwise, it assumes it should be a daemon that does a tail -f
# on $store_log, alternately archiving everything and sleeping
# for a while to reduce the load on the system.
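#
# Example invocations (paths illustrative):
#
#     squidcache2archive -catchup       # drain all of store.log, then exit
#     squidcache2archive &              # daemon mode: tail -f store.log
#     ln -s squidcache2archive unlinkd  # run in squid's place as unlinkd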

$me = $0;
$me =~ s,.*/,,;		# just want the basename

$| = 1;			# unbuffer STDOUT

$cache_basedir		= "/var/spool/squid";
$archive_basedir	= "/archives/http";
$log_base		= "$archive_basedir/log";
$squid_logs		= "$archive_basedir/squid-logs";
$store_log		= "$squid_logs/store.log";
$pidfile		= "$squid_logs/squidcache2archive.pid";

if ( ( $ARGV[0] eq "-catchup" ) || ( $ARGV[0] eq "-ketchup" ) ) {
    # process the whole shebang at once instead of tail -f'ing it
    $mode = "catchup";
    open( STORELOG, "< $store_log" ) ||
        die "unable to read store log, $store_log! $!";
    *FHANDLE = *STORELOG;
}
elsif ( $me =~ /unlinkd/ ) {
    $mode = "unlinkd";
    *FHANDLE = *STDIN;
}
else {
    $mode = "daemon";

    open( PIDFILE, "> $pidfile" ) || die "couldn't write to pid file! $!";
    print PIDFILE $$, "\n";
    close( PIDFILE ) || die "couldn't close pidfile! $!";

    open( STORELOG, "tail -f $store_log |" ) || die "unable to get tail! $!";
    *FHANDLE = *STORELOG;
}

$log = $log_base . ".$mode";
open( LOG, ">> $log" ) || die "couldn't append to log! $!";

select( LOG ); $| = 1;	# unbuffer LOG so entries appear immediately

while (<FHANDLE>) {

    chomp;

    if ( $mode eq "unlinkd" ) {
        $timestamp = time;
	$cache_file = $_;
    }
    else {

	# see http://irb.cs.uni-magdeburg.de/~elkner/proxy/Squid/storelog2.shtml
	# for docs on the format of store.log
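	#
	# a typical entry (illustrative values; shown wrapped here, but
	# it's all one line):
	#
	# 916634059.111 SWAPOUT 00052A9B 200 916634058 916630000 916720000
	#     text/html 4176/4176 GET http://impressive.net/people/gerald/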
	($timestamp,$tag,$swapnum,$status,$httpdate,$lastmod,
	    $expires,$ct,$length_and_size,$method,$url) = split;

	if ( $tag ne "SWAPOUT" ) {
	    &log( "skipping entry dated $timestamp because its tag is $tag");
	    next;
	}

	$timestamp =~ s/\..*//;	# don't care about milliseconds; strip 'em

	$temp = $swapnum;
	$temp =~ s,^..(..)(..)..,$1/$2,;
	$cache_file = $cache_basedir . "/" . $temp . "/" . $swapnum;
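	# e.g. (illustrative) swap file number 00052A9B maps to
	# /var/spool/squid/05/2A/00052A9B under squid's default
	# two-level cache_dir layout, which the regex above assumes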
	
    }

    open( CACHEFILE, "< $cache_file" ) || do {
        &log( "couldn't read from cache file $cache_file! $!, skipping it.");
	next;
    };
    $line = "";
    while ( $line !~ /http/ ) {	# sometimes this stuff spans 2 or more lines
        chomp( $line .= <CACHEFILE> );
    }
    $url = $line;
    $url =~ s/.*\x00\x00\x00http/http/;
    $url =~ s/\x00.*//;

    # this is something like what the http-header separation code
    # could look like (untested sketch):
    #
    # while ( $line = <CACHEFILE> ) {	# swallow up the header
    #     last if $line =~ /^\r?$/;	# blank line ends the header
    # }
    #
    # need better docs on squid's cache file format (or time to spend
    # reading its code) before spending any more time on this though...

    close( CACHEFILE ) || do {
        &log( "couldn't close cachefile $cache_file! $!" );
    };

    # encode the URL so it fits into the filesystem nicely yet
    # still retains most of its meaning
    $encoded_url = $url;
    $encoded_url =~ s,^http://,,;
    $encoded_url =~
        s/[^A-Za-z0-9:\.@#^&_+=\?\/-]/uc sprintf("%%%02x",ord($&))/eg;
    $encoded_url =~ s#/#,#g;
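    # e.g. (illustrative) http://impressive.net/people/gerald/?x=a%20b
    # encodes to impressive.net,people,gerald,?x=a%2520b -- a literal ","
    # in a URL gets %-encoded first, so "," always means "/"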

    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
        localtime($timestamp);

    # archive filename looks like this:
    #     /archives/http/1999/01/18/03:34:19/impressive.net,people,gerald,

    if ( $mode eq "unlinkd" ) {
	# change the filename slightly if we're running in unlinkd mode --
	# add an "aa:" before the HH:MM:SS, meaning "archived at" as
	# opposed to the default meaning which is "browsed at". These files
	# can be re-munged to the browsed-at timestamp later if/when I
	# figure out how to get that info out of the cached file; in the
	# meantime, no big whoop
	$archive_dir = $archive_basedir .
	    sprintf( "/%4d/%02d/%02d/aa:%02d:%02d:%02d",
	    $year+1900, $mon+1, $mday, $hour, $min, $sec );
    }
    else {
	$archive_dir = $archive_basedir .
	    sprintf( "/%4d/%02d/%02d/%02d:%02d:%02d",
	    $year+1900, $mon+1, $mday, $hour, $min, $sec );
    }

    system "mkdir -p $archive_dir";
    # @@ check return code

    $archive_filename = $archive_dir . "/" . $encoded_url;

    system "cp $cache_file \"$archive_filename\"";
    # @@ check return code

    &log( "archived $cache_file as $archive_filename" );

    if ( $mode eq "unlinkd" ) {
        &log( "unlinked $cache_file" );
	unlink( $cache_file ) || do {
	    &log( "unlink of $cache_file failed! $!" );
	    next;
	};
    }

    if ( $mode eq "daemon" ) {
        sleep( 2 );	# so we don't bog down the system actively catching
    }			# up after our 30m nap

    $current_time = time;
    if ( ( $current_time - $timestamp < 60 ) && ( $mode eq "daemon" ) ) {
	# if we're within 60s from the bottom of store.log, sleep for 5m
	# to avoid bogging down the system while active surfing is in
	# progress -- the unlinkd replacement will catch anything before
	# Squid can remove it anyway
	&log( "all caught up; sleeping for 5 mins." );
	sleep( 300 );
	&log( "aaah, that was refreshing! back to work!" );
    }

}

close( FHANDLE ) || die "couldn't close input filehandle! $!";

# @@ this will never terminate naturally; should add a signal handler that
# dies nicely? (i.e., archives any pending stuff, then exits.)
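#
# a minimal sketch of such a handler (it just exits cleanly; archiving
# pending entries first is left for later). it would need to be
# installed before the main loop, of course:
#
# sub bail {
#     &log( "caught SIG$_[0], exiting" );
#     unlink( $pidfile ) if $mode eq "daemon";
#     exit;
# }
# $SIG{INT} = $SIG{TERM} = \&bail;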

exit;

sub log {	# always called as &log() to stay clear of Perl's builtin log()

    my $message = shift;
    my $time = time;

    print LOG "$time $me\[$$\]: $message\n";

}

