#! /usr/bin/perl -w

# (c) 2000 Matthew Astley <matthew@fruitcake.demon.co.uk>
# Released under the GNU General Public Licence v2 or later
# (hmm, need to add the blurb <sigh>)

=head1 README

Tail-follow/slurp-entire some logfiles and spew the lot to stdout with
unix timestamps and filenames.

The output format is suitable for 'spewfilter.pl', yet another configurable logfile watcher.

=head1 BUGS & WISHLIST

If you have something other than a Debian/GNU Linux system then you
might wish to change FileSelection(). If you do this, follow the
pattern and send me a patch!

=cut

# $Id: logspew.pl,v 1.3 2001/05/10 00:53:15 mca1001 Exp $

# ideas for improvements
#    improve the year-guessing code for log lines that have no year! (Y2K every year...)
#    things that should probably be implemented elsewhere
#      generalised cowthrottle (load & /proc/apm)  (File::Tail probably updates the atimes...)
#      run a command at intervals to check that certain services are still running
#    make a stab at pulling lines that get written to several log files?
#    make current/old/dull selection more easily configurable
#    during slurp mode, -p should periodically output the most recent date seen (progress meter)
#    check whether the code to open new files works after log rotation
#    select on pipes/sockets/filehandle attached to processes
#    hmm, since the slurped compressed logs come from processes ... generalise
#      to slurp & tail at the same time? needs more advanced config
#    others marked with "^^^"

# Be warned that dodgy filenames could get executed rather than read!
# (shell escapes while unzipping)

# Speed: slurping huge chunks on a P2-400MHz,
#   without calling spew() on each line, 5.2MB/user-CPU-sec
#   doing the slow pattern matches to extract times, more like 300kB/user-CPU-sec
# stats for 280MB of logs from a moderately busy firewall running Debian 2.2
# note that the times don't include child processes, eg. zcat

############################################################################

use strict;
use Getopt::Long;
use Time::Local;

use vars qw(%opts @stderr %lasttime %months);

# %opts - command line switches

# @stderr - array onto which warnings are pushed, so they can be
#	    treated as another stream

# %lasttime - key = filename, value = unixtime of last line with a date on it
# (allows a timestamp for each line in a multi-line message - but don't sort them!)

%opts = (
	 'follow'   => 0,
	 'ignored'  => 0,
	 'logdir'   => '/var/log',  # should be OS dependent^^^ (also see ArgSanity)
	 );

GetOptions(\%opts,
	   "help|?",
	   "follow|f",
	   "ignored|i",
	   "logdir|l=s",
	   );
ArgSanity(); # help message etc.

@months{qw(jan feb mar apr may jun jul aug sep oct nov dec)} = (0 .. 11);

@stderr = ();
#$SIG{'__WARN__'} = sub { push @stderr, (join " ", "[warn]", @_) };
#$SIG{'__DIE__'}  =
#    sub { my $msg = join " ", "[die]", @_;
#	  die(@stderr
#	      ? (join "", "Buffered warnings:\n", @stderr, $msg )
#	      : $msg ); };

$SIG{'INT'} = $SIG{'TERM'} = sub { exit; };
END {
    my ($rsec, $usec, $kb) = (time()-$^T, (times())[0], $stats::bytes / 1024 );
    warn "Processed ${kb}kB in $rsec sec(real) / $usec sec (user CPU time)\n";
    warn "  ".($kb / $usec)."kB/CPU-sec\n" unless $usec == 0;
}

chdir $opts{'logdir'} or die "Can't chdir to $opts{'logdir'}: $!";

if ($opts{'follow'}) {
    # there's a Debian package (libfile-tail-perl) for this...
    require File::Tail;
    import File::Tail 0.91;

    my %files; # key = filename, value = filehandle/tail object
    $| = 1;

    while (1) {
	foreach my $file (FileSelection()) {
	    next if defined $files{$file};  # if we're going round again, just open new files

	    # may be horribly wrong if the file has been messed
	    # with... just provides a default in case there's no
	    # timestamp
	    $lasttime{$file} = $^T - (86400 * -M $file);

	    my $tail = eval {
# File::Tail code pinched from select_demo
	      File::Tail->new('name'=>$file,
			      'debug'=>0,
			      'errmode'=>'die',
			      'maxinterval'=>5, 'interval'=>1, # impatient!
			      );
	    }; # may croak
	    if ($@) {
		warn $@;
	    } else {
		$files{$file} = $tail;
	    }
	}

	my @files = values %files;
#	print "Watching ".(join ", ", keys %files)."\n";

	my $scantime = time() + 3600; # we'll check for new files after this time
	while (time < $scantime) {
	    my ($nfound,$timeleft,@pending)=
	      File::Tail::select(undef,undef,undef,1, @files);
#	    unless ($nfound) {
#		my @ints;
#		foreach(@files) {
#		    push(@ints,$_->interval);
#		}
#		print "Nothing new! - ".localtime(time)."(".join(",",@ints).")\n";
#	    }
	    foreach (@pending) {
		unless ($_->predict) {
		    my $fn = $_->{'input'};

		    # timestamps on logs that have no time info will
		    # be at most 30sec wrong (follow mode only!)
		    $lasttime{$fn} = time() if $lasttime{$fn}+30 < time();
$stats::bytes = (defined $stats::bytes ? $stats::bytes : 0) + length($_);
		    spew($fn, $_->read);
		}
	    }
	}
    }
} else {    # not $opts{'follow'}  --> slurp
    require FileHandle;
    import FileHandle;

    my %files; # key = filename, value = M-time
    foreach my $file (FileSelection()) {
	$lasttime{$file} =
	$files{$file} = $^T - (86400 * -M $file);
	die "M-time is undef for $file" unless defined -M $file;
    }
    foreach my $file (sort {$files{$a} <=> $files{$b}} keys %files) {
	my $fh = new FileHandle;
	my $thingie = "< $file";
	if ($file =~ /\.(gz|z|Z)$/) {
	    $thingie = "zcat $file |";
	} elsif ($file =~ /\.bz2$/) {
	    $thingie = "bzcat $file |";
	}
	if (not $fh->open($thingie)) {
	    warn "Read $file failed: $!\n";
	} else {
	    warn "Slurping $file\n";
	    while (<$fh>) {
$stats::bytes = (defined $stats::bytes ? $stats::bytes : 0) + length($_);
		spew($file, $_);
	    }
	}
    }
}

print join "", @stderr;
exit;


############################################################################

sub spew # arg = (filename, line)
{
    # sorry, I8n people - the date regexps are all in English. 8-/

    my ($fn, $ln) = (shift, shift);
    my ($wkday, $month, $mnum, $mday, $hr, $mn, $sc, $yr) = ();
    my $utime;
    my $oln = $ln; # in case of screwups with date extraction

    # ^^^ This code pays no attention to timezones. I could claim that
    # I assume I'm working in local time, but I don't check that
    # things aren't in GMT

    # Date extraction code will pull various formats, it sticks a '?'
    # on the end when it's guessing (unless it's guessing the year,
    # then it's just crispy rolls and pastries)

    # Sorry it's a mess  8-)

# [MTWFS][ouehra][neduitn]       matches English three-letter weekdays (among other things)
# [JFMASOND][aepuco][nbrylgptvc] matches English three-letter month names (among other things)

# ^^^ it would be quite neat to just dump the numbers in the relevant
# variables and just have one instance of the timelocal call and (when
# it arrives) year guessing code

	if ($ln =~ s/^\[(Mon|Tue|Wed|Thu|Fri|Sat|Sun) ([JFMASOND][aepuco][nbrylgptvc]) ([ \d]\d) (\d{2}):(\d{2}):(\d{2}) (\d{4})\] //) {
	# [Mon Jan  2 12:34:45 2017] ... (something^^^ uses this format)
	($wkday, $month, $mday, $hr, $mn, $sc, $yr) = ($1, $2, $3, $4, $5, $6, $7);
	if (defined $months{lc($month)}) {
	    $mnum = $months{lc($month)};
	    $utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
	} else {
	    warn "Invalid month '$month' in $fn near unixtime $lasttime{$fn}\n";
	    $ln = $oln;
	    $utime = $lasttime{$fn}."?";
	}
    } elsif ($ln =~ s@^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) @@) {
	# 2017-01-02 12:34:56 ... (exim uses this format)
	($yr, $mnum, $mday, $hr, $mn, $sc) = ($1, $2 - 1, $3, $4, $5, $6);
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } elsif ($ln =~ s@^(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2}):(\d{2})\| @@) {
	# 2017/01/02 12:34:56| ... (squid/cache.log uses this format)
	($yr, $mnum, $mday, $hr, $mn, $sc) = ($1, $2 - 1, $3, $4, $5, $6);
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } elsif ($ln =~ s@^\[(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2}):(\d{2}), (\d+)\]@[$7]@) {
	# [2017/01/02 12:34:56, 9] ---> [9]    (samba uses this format. What's the digit for? TZ?)
	($yr, $mnum, $mday, $hr, $mn, $sc) = ($1, $2 - 1, $3, $4, $5, $6);
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } elsif ($ln =~ s/^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ([ \d]\d) (\d{2}):(\d{2}):(\d{2}) //) {
	# Jan  2 12:34:56 .... (syslog/kernel uses this format, we play "guess the year")
	($month, $mday, $hr, $mn, $sc) = ($1, $2, $3, $4, $5);
	$yr = (localtime(time))[5]; # ARGH!^^^ at least base it on the file's mtime, and whether we're in December or January... or something
	die "Invalid month '$month' in $fn" # hey, it matches the pattern so I've no sympathy
	    unless defined $months{lc($month)};
	$mnum = $months{lc($month)};
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } elsif ($ln =~ s/^([ \d]\d) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{2}):(\d{2}):(\d{2}) //) {
	# 18 Aug 18:48:58    (xntpd uses this format, we play "guess the year")
	($mday, $month, $hr, $mn, $sc) = ($1, $2, $3, $4, $5);
	$yr = (localtime(time))[5]; # ARGH!^^^ at least base it on the file's mtime, and whether we're in December or January... or something
	die "Invalid month '$month' in $fn" # hey, it matches the pattern so I've no sympathy
	    unless defined $months{lc($month)};
	$mnum = $months{lc($month)};
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } elsif ($ln =~ s/^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) ([JFMASOND][aepuco][nbrylgptvc]) ([ \d]\d) (\d{2}):(\d{2}):(\d{2}) (\d{4}) [A-Z]{3}: //) {
	# Mon Jan  2 12:34:56 2017 BST: ... diald.log format
	($wkday, $month, $mday, $hr, $mn, $sc, $yr) = ($1, $2, $3, $4, $5, $6, $7);
	die "Invalid month '$month' in $fn" # hey, it matches the pattern so I've no sympathy
	    unless defined $months{lc($month)};
	$mnum = $months{lc($month)};
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } elsif ($ln =~ m/^Log from \S+ at (Mon|Tue|Wed|Thu|Fri|Sat|Sun) ([JFMASOND][aepuco][nbrylgptvc]) ([ \d]\d) (\d{2}):(\d{2}):(\d{2}) (\d{4})$/) {
	# Log from user@host at Mon Jan  2 12:34:56 2017 (cfingerd.log format)
	($wkday, $month, $mday, $hr, $mn, $sc, $yr) = ($1, $2, $3, $4, $5, $6, $7);
	die "Invalid month '$month' in $fn" # hey, it matches the pattern so I've no sympathy
	    unless defined $months{lc($month)};
	$mnum = $months{lc($month)};
	$utime = timelocal($sc, $mn, $hr, $mday, $mnum, $yr);
    } else {
	# if we can't extract a time from the line, use the one we had
	# last time (which may just be the M-time). If we're in follow
	# mode, this may get updated with the current time if it looks
	# ancient.
	warn "Time extraction may have missed something in $fn: $ln"
	    if $ln =~ /\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b/;
	$utime = $lasttime{$fn}."?";
    }

# ^^^ maybe we should have some code to spot huge leaps in time on a particular file stream?

    if (defined $utime) {
	$lasttime{$fn} = $utime unless $utime =~ /\?$/; # ^^^ not sure the condition is sensible here...
    } else {
	$utime = '???';
    }
    print "$utime, $fn, $ln";
}


sub ArgSanity {
    die "Syntax: $0 [-f | --follow]   (default is slurp-mode)
		[-i | --ignored]   (list ignored files)
		[-l | --logdir] <directory>   (default is /var/log)
		[+] [<files...>]

Follow (like 'tail -f') or slurp (batch process) the specified
logfiles and filter them into bins as appropriate.

The default input file set will depend on your system. Currently
Debian GNU/Linux potato should work nicely but anything else will not
self-configure.

The log file set can be overridden on the command line. If the first
non-switch command line option is '+' instead of a filename then the
named files are taken in addition to the default ones, otherwise they
are used instead.

Files from the input set are sorted into current, old and
uninteresting, based on rules for your system. The obvious rule is
that 'foo' is current, while 'foo.0' and 'foo.1.gz' are old. Dull
files include web server logs, which are analysed by other programs.

Tail mode watches only current files, checking for new ones every
(hour). Slurp mode reads in old and current files and sorts them by
reverse modification time before dumping them.
" if $opts{'help'};
}


sub FileSelection # no args
#| Return three arrays, ([current logfiles], [old logfiles], [files to ignore])
{
    my $type = OSfinder();
    my ($curr, $old, $dull) = eval "return FileSelection_$type()";
    warn "I don't pay any attention to \@ARGV (commandline specified file set) at the moment. Sorry.\n" if @ARGV;
    die "$@ for FileSelection_$type()\nThat probably means I don't understand your box" if $@;
    warn "Ignored file list:\n  ".(join "\n  ", @{$dull})."\n" if $opts{'ignored'};

    my @files = (@{$curr}, ($opts{'follow'} ? () : @{$old}) );
    my @badfiles = grep /, /, @files;

    die ("Something is trying to slip some weird filenames in: ('".(join "'; '", @badfiles).
	 "'). This would break the output format quite badly, and without quoting ".
	 "the comma I'm not very flexible on this. Sorry.\n")
	if @badfiles;

    return @files;
}


sub OSfinder # no args
#| Return a string naming the OS
{
    my $type;
    my %sysguess = ('linux' => {"/etc/debian_version" => "DebianGNULinux",
				"/etc/redhat-release" => "Redhat",
			    },
		    'mswin32' => "laugh like a drain -- you're running Windoze aren't you?",
		    );
    # let's worry about version numbers for the system later on, eh?

    if (defined $sysguess{lc($^O)}) {
	my $syshash = $sysguess{lc($^O)};
	die $syshash unless ref $syshash;

	foreach my $sys (keys %{$syshash}) {
	    if (-r $sys) {
		my $t = $syshash->{$sys};
		die "You freak, your system appears to be both '$type' and '$t'"
		    if $type;
		$type = $t;
	    }
	}
	if (not $type) {
	    $type = 'DebianGNULinux';
	    warn "Unknown $^O system type, taking $type as a default";
	}
    } else {
	die "What sort of ship are you sailing here? \$^O = '$^O'";
    }

    return $type;
}


sub FileSelection_DebianGNULinux # same args/return as FileSelection
#| Reasonably specific to a 2.2 Debian box, and I've not checked the release number..
#| Roll your own and give it a name!
{
    open FIND, 'find . -type f |'
	or die "Can't read from find-file pipe: $!";
    my @files = map {s@^\./@@; $_} <FIND>;
    close FIND;

    my (@current, @old, @ignore);

# these files either aren't important or have some other program to
# notify you of changes
    my @ignorespec = qw(
apt-move.log
apache-ssl/access.log
apache-ssl/ssl.log
apache/access.log
dmesg
faillog
lastlog
ksymoops/
ntpstats/loopstats
ntpstats/peerstats
setuid.changes
setuid.today
setuid.yesterday
squid/store.log
squid/access.log
sxid.log
wtmp
wu-ftpd/xferlog
wwwoffle.log
			);

    foreach my $file (@files) {
	chomp $file;
	if ( grep {$_ eq substr($file, 0, length($_))} @ignorespec ) {
	    push @ignore, $file;
	} elsif ( $file =~ /\.(\d+|\d+\.gz)$/ ) {
	    push @old, $file;
	} else {
	    push @current, $file;
	}
    }

    return ([@current], [@old], [@ignore]);
}

