#!/usr/bin/perl
#
# amavis-stats -- generate rrds from amavis log output
#
# Author: Mark Lawrence
#
# Copyright (C) 2003, Mark Lawrence (nomad@null.net)
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License with
#  the Debian GNU/Linux distribution in file /usr/share/common-licenses/GPL;
#  if not, write to the Free Software Foundation, Inc., 59 Temple Place,
#  Suite 330, Boston, MA  02111-1307  USA
#
# On Debian systems, the complete text of the GNU General Public
# License, version 2, can be found in /usr/share/common-licenses/GPL-2.


# ########################################################################
# Dependencies
# ########################################################################
use strict;
use warnings;
use Getopt::Std;
use Time::localtime;
use Time::Local;
use RRDs;
use Fcntl ':flock';


# ########################################################################
# Globals
# ########################################################################
our (
    $me,         # this program's name (basename of $0)
    $version,    # this program's version string
    $debug,      # true when the -d flag was given
    $verbose,    # true unless the -q flag was given
    $pkg,        # name of this package (usually amavis-stats)
    $libdir,     # directory holding the rrd files and the state files
    $lockfile,   # lock file to prevent more than one invocation at a time
    $logfile,    # input log file named on the command line
    $statefile,  # state kept between invocations (pos, numv, lastupdate)
    $namesfile,  # mappings of IDs to virus names
    $countfile,  # mappings of IDs to virus occurrences
    $seenfile,   # mappings of IDs to virus first/last seen times
    $rrdstep,    # rrd step size
    $spos,       # start position of input file this run
    $pos,        # current position in input file
    $eof,        # size of the input file when this run started
    $line,       # string containing current line
    $epoch,      # timestamp of the current line, seconds since 1970
    $lastepoch,  # seconds since 1970, previous time around
    $numv,       # number of IDs handed out so far, continually incrementing
    $lastupdate, # epoch of last global rrd update
    $year,       # current year (sampled once at startup)
    %rvid,       # in-memory mapping of virus names to IDs
    %vnames,     # in-memory mapping of IDs to virus names
    %occurence,  # in-memory mapping of IDs to counts/occurrences
    %firstseen,  # in-memory mapping of virus first seen times
    %lastseen,   # in-memory mapping of virus last seen times
    %opt         # command line options
);


# ########################################################################
# Initial values & Constants
# ########################################################################
($me = $0) =~ s%.*/%%;    # strip any leading directory, leaving the basename
$version   = "0.1.9"; # this value is auto-updated by the packaging system
$pkg       = "amavis-stats";
$lockfile  = "/var/lock/$pkg";     # held for the whole run, see semlock()
$libdir    = "/var/lib/$pkg";      # rrd files are stored here as <id>.rrd
$statefile = "$libdir/$pkg.state"; # last read position of the logfile
$namesfile = "$libdir/$pkg.names"; # stores the virus name to id mappings
$countfile = "$libdir/$pkg.count"; # per virus totals
$seenfile  = "$libdir/$pkg.seen";  # first and last time() seen
$rrdstep   = 300;                  # compared against epoch times, i.e. seconds

# Month abbreviations as they appear at the start of a log line, mapped
# to their calendar number (Jan => 1 ... Dec => 12).
my %months;
@months{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (1 .. 12);


# ########################################################################
# Subroutines
# ########################################################################

#
# Print the one-line command synopsis to STDERR
#
sub usage() {
    my $synopsis = "usage: $0 [-hVqdr] file\n";
    print STDERR $synopsis;
}

#
# Print the full help text to STDERR: the usage() synopsis followed by
# the version, a short description, the option list and some examples.
# Does not exit; the caller decides what happens next.
#
sub help() {
    usage();
    # The heredoc interpolates $version and $0; everything up to the
    # EOF marker is emitted verbatim.
    print STDERR << "EOF";

    Version: $version

    This program generates virus infection statistics from amavis/syslog
    log files. It is typically called from cron(8), but can also be used
    from the command line when populating the databases with historical
    data.

    -h        : this (help) message
    -v        : does nothing (legacy verbose option)
    -q        : quiet mode - no output
    -d        : print debugging messages to stderr
    -r        : reset file pointer to 0, instead of starting at last position
    -V        : display version and exit

    examples:

    Initial import of existing data:
    amavis# $0 /var/log/mail.info.2 
    amavis# $0 -r /var/log/mail.info.1 
    amavis# $0 -r /var/log/mail.info.0 
    amavis# $0 -r /var/log/mail.info 

    Normal usage:
    amavis# $0 /var/log/amavis.log 

EOF

}


#
# Command line options processing
#
# Parses the switches into %opt and sets $verbose, $debug, $logfile and
# $year. Exits with status 0 after -h or -V, and with status 1 on bad
# usage or when the input file does not exist.
#
sub init()
{
    my $opt_string = 'hvqdf:rV';
    if (!getopts( "$opt_string", \%opt )) {
        usage();
        exit 1;
    }

    $verbose = $opt{q} ? 0 : 1;
    $debug   = 1 if $opt{d};

    # dbg() is silent until $debug is set, so the banner has to come
    # after the options have been parsed.
    dbg("$me version $version");

    # Help and version were explicitly asked for: that is not a failure.
    if ($opt{h}) {
        help();
        exit 0;
    }

    if ($opt{V}) {
        print "$version\n";
        exit 0;
    }

    if (defined $opt{f} and length $opt{f}) {
        # legacy way to specify input file
        $logfile = $opt{f};

    } elsif (defined $ARGV[0] and length $ARGV[0]) {
        # now expect file on command line
        $logfile = $ARGV[0];

    } else {
        usage();
        exit 1;
    }

    if ( ! -f $logfile ) {
        err("Could not open file $logfile");
        exit 1;    # non-zero so that cron and wrappers notice the failure
    }

    $year = localtime->year() + 1900;
}


#
# Make sure that only one copy is running at a time
#
# Takes an exclusive, non-blocking flock on $lockfile through the LOCKF
# handle and records our PID in it. Dies if the file cannot be opened and
# exits if another process already holds the lock.
#
sub semlock {
    # Open for append: a plain ">" would truncate the file, wiping the
    # PID of a running instance, before we even know whether we may have
    # the lock.
    open (LOCKF, '>>', $lockfile) or die "Could not open $lockfile: $!";
    unless (flock LOCKF, LOCK_EX | LOCK_NB) {
        err("File $lockfile locked by another process\n");
        exit;
    }

    # The lock is ours, so it is now safe to discard stale contents.
    truncate(LOCKF, 0);

    # Unbuffer the handle so the PID reaches the file immediately rather
    # than sitting in the buffer until the handle is closed.
    my $previous = select(LOCKF);
    $| = 1;
    select($previous);

    print LOCKF "$$\n";
    dbg("Have lock on $lockfile\n");
}


#
# Give the lock back: remove the lock file, then close the LOCKF handle
# (closing is what actually drops the flock). Strictly optional, since
# handles are closed and locks released when the program exits anyway.
#
sub semunlock {
    unlink $lockfile;
    close(LOCKF);
}


#
# Load the values of the previous run into variables
#
# Fills $spos, $numv, $lastupdate and the %rvid, %vnames, %occurence,
# %firstseen and %lastseen tables from the files under $libdir. Exits
# with status 1 if $libdir is unusable; dies if a state file exists but
# cannot be opened.
#
sub loadState {
    dbg("loadState()");
    $spos = undef;

    #
    # Check that we have somewhere to save our status - Not much point
    # in continuing otherwise.
    #
    if ((! -d "$libdir") or (! -w "$libdir")) {
        err("$libdir does not exist or cannot be written to.");
        exit 1;
    }

    #
    # Grab the previous position reached in the log file, plus
    # the total number of different viruses we have seen
    #
    if (-f "$statefile") {

        dbg("opening file $statefile");
        open (my $state, '<', $statefile)
            or die "Could not open $statefile: $!";
        while (my $line = <$state>) {
            if ($line =~ /^pos:\s*(\d+)/) {
                $spos = $1;
            }
            elsif ($line =~ /^numv:\s*(\d+)/) {
                $numv = $1;
            }
            elsif ($line =~ /^lastupdate:\s*(\d+)/) {
                $lastupdate = $1;
            }
        }
        close $state;

        dbg("opening file $namesfile");
        open (my $names, '<', $namesfile)
            or die "Could not open $namesfile: $!";
        while (my $line = <$names>) {
            if ($line =~ /^(\d+)\s+(.*)/) {
                $rvid{$2} = $1;
                $vnames{$1} = $2;
            }
        }
        close $names;

        dbg("opening file $countfile");
        open (my $counts, '<', $countfile)
            or die "Could not open $countfile: $!";
        while (my $line = <$counts>) {
            if ($line =~ /^(\d+)\s+(.*)/) {
                $occurence{$1} = $2;
            }
        }
        close $counts;

        dbg("opening file $seenfile");
        open (my $seen, '<', $seenfile)
            or die "Could not open $seenfile: $!";
        while (my $line = <$seen>) {
            if ($line =~ /^(\d+)\s+(\d+)\s+(\d+)/) {
                $firstseen{$1} = $2;
                $lastseen{$1}  = $3;
            }
        }
        close $seen;
    }

    #
    # If we have not run before reset...
    #
    if (!defined $spos) {
        msg("First Time Run");
        $spos       = 0; # position into the log file
        $numv       = 0; # number of virus types seen
        $lastupdate = 0; # epoch of the last global rrd update
    }

    #
    # A state file that lacks a "numv:" or "lastupdate:" line would leave
    # these undefined. Fall back to the highest ID already in use, so new
    # IDs cannot collide with loaded ones, and to "never updated".
    #
    if (!defined $numv) {
        $numv = 0;
        foreach my $id (keys %vnames) {
            $numv = $id if $id > $numv;
        }
    }
    $lastupdate = 0 unless defined $lastupdate;

    #
    # If -r <file> on command line start at beginning of file
    #
    if ($opt{r}) {
        $spos = 0;
    }

    if ($debug) {
        dbg("start position: $spos numv: $numv lastupdate: $lastupdate");
        while ( my ($id,$count) = each (%occurence)) {
            my $name = defined $vnames{$id} ? $vnames{$id} : '';
            dbg("#$id: $name, seen $count times");
        }
    }
}


#
# Save the current position and all per-virus tables for the next run.
#
# Writes $statefile (pos, numv, lastupdate), $namesfile, $countfile and
# $seenfile. Each file is replaced atomically, so a crash or a full disk
# part-way through cannot leave a truncated state file behind. Dies if
# any file cannot be written.
#
sub saveState {

    dbg("saveState(): eof: $eof numv: $numv lastupdate: $lastupdate");

    # Build the three per-virus tables in one pass, in ID order so the
    # files are stable from one run to the next.
    my ($names, $counts, $seen) = ('', '', '');
    foreach my $id (sort { $a <=> $b } keys %vnames) {
        $names  .= "$id $vnames{$id}\n";
        $counts .= "$id $occurence{$id}\n";
        $seen   .= "$id $firstseen{$id} $lastseen{$id}\n";
    }

    writeStateFile($statefile,
                   "pos: $pos\nnumv: $numv\nlastupdate: $lastupdate\n");
    writeStateFile($namesfile, $names);
    writeStateFile($countfile, $counts);
    writeStateFile($seenfile,  $seen);
}


#
# Replace $file with $content: write a temporary file next to it, then
# rename it into place. Write errors often only surface at close, so that
# is checked too. Dies on any failure, removing the temporary file first.
#
sub writeStateFile {
    my ($file, $content) = @_;
    my $tmp = "$file.tmp.$$";

    open (my $out, '>', $tmp) or die "Could not write to $tmp: $!";
    unless ((print {$out} $content) and close($out)) {
        my $why = $!;
        unlink $tmp;
        die "Could not write to $tmp: $why";
    }
    unless (rename($tmp, $file)) {
        my $why = $!;
        unlink $tmp;
        die "Could not write to $file: $why";
    }
}


#
# Map a virus/category name to its numeric ID.
#
# A name seen before yields the ID it was given then. A new name gets the
# next ID ($numv is bumped), is announced via msg(), and has its
# first-seen time set to $epoch.
#
sub getVid {
    my ($virus, $epoch) = @_;

    # Already known: nothing to register.
    return $rvid{$virus} if exists $rvid{$virus};

    my $id = ++$numv;

    msg("New virus ($virus) (#$id) seen at $epoch");

    $vnames{$id}    = $virus;
    $rvid{$virus}   = $id;
    $firstseen{$id} = $epoch;

    return $id;
}


#
# Record one more sighting of the virus with the given ID, and remember
# $epoch as the time it was last seen.
#
sub upCount {
    my ($id, $epoch) = @_;

    # Auto-increment turns a missing entry into 1, so the first sighting
    # needs no special case.
    $occurence{$id}++;

    $lastseen{$id} = $epoch;
}



#
# Classify the current log line (the global $line) and update the
# statistics accordingly.
#
# The line is expected to start with "Mon DD HH:MM:SS". Its timestamp is
# stored in $epoch, every known rrd is brought forward in $rrdstep steps
# up to that time, and the line is then counted as passed, spam, banned
# or infected (plus one count per virus name found). Lines whose
# timestamp cannot be parsed are skipped instead of aborting the run.
#
sub classify {

    my ($mon, $day, $time) = split(/\s+/, $line);

    #
    # When was this message received?
    #
    my ($hour, $min, $sec) = split(/:/, defined $time ? $time : '');
    my $monnum = defined $mon ? $months{$mon} : undef;

    if (!defined $monnum or !defined $sec
        or grep { $_ !~ /^\d+$/ } ($day, $hour, $min, $sec)) {
        dbg("skipping line with unrecognised timestamp");
        return;
    }
    $mon = $monnum;

    # timelocal() croaks on impossible dates, e.g. a "Feb 29" logged in
    # a leap year but read in a non-leap one, so trap that and fall back
    # to last year as for any date that would lie in the future.
    my $when = eval {
        timelocal($sec, $min, $hour, $day, $mon-1, $year-1900);
    };
    if (!defined $when or $when > time()) {
        # date is actually last year
        $when = eval {
            timelocal($sec, $min, $hour, $day, $mon-1, $year-1901);
        };
    }
    if (!defined $when) {
        dbg("skipping line with invalid date $mon-$day $hour:$min:$sec");
        return;
    }
    $epoch = $when;

    if (!defined $lastepoch) {
        $lastepoch = $epoch - 1;
    }
    dbg("line at $year-$mon-$day $hour:$min:$sec epoch: $epoch");

    #
    # Update all rrds if we are more than $rrdstep seconds since the last
    # update
    #
    if ($lastupdate == 0) {
        $lastupdate = int($epoch / $rrdstep) * $rrdstep;
        dbg("First update: $lastupdate");
    }

    my $count = int(($epoch - $lastupdate) / $rrdstep);
    for (my $i = 1; $i <= $count; $i++) {
        $lastupdate = $lastupdate + $rrdstep;
        foreach my $id (keys %vnames) {
            updateRRD($id, $lastupdate);
        }
    }

    #
    # Save the stats according to the classification of the email
    #
    if ($line =~ /Passed/) {
        dbg("passed: $epoch: $year-$mon-$day $hour:$min:$sec");
        recordHit("passed", $epoch);

    } elsif ($line =~ /\sSPAM(-TAG)*,\s/) {
        dbg("spam $epoch: $year-$mon-$day $hour:$min:$sec");
        recordHit("spam", $epoch);

    } elsif ($line =~ /\sBANNED\sname\/type\s/) {
        dbg("banned $epoch: $year-$mon-$day $hour:$min:$sec");
        recordHit("banned", $epoch);

    } elsif ($line =~ /\s(INFECTED)\s+\((.*?[\(.*?\)]*)\)/ or  # amavisd-new
            $line =~ /\s(quarantine[:|d;]).*?virus='(.*?)'/ or # amavisd
            $line =~ /(part-\d+):\s+(.*?)\s+FOUND/ ) {         # clamav
        my $viruses = $2;
        dbg("viruses: \"$viruses\" at $epoch: $year-$mon-$day $hour:$min:$sec");

        #
        # Update the overall infected emails statistics
        #
        recordHit("infected", $epoch);

        #
        # What is this specific nasty little bugger(s) called?
        # Update his statistics as well.
        #
        foreach my $virus (split(/,*\s+/, $viruses)) {
            # split() yields an empty leading field when the list starts
            # with whitespace; that is not a virus name.
            next if $virus eq '';
            recordHit($virus, $epoch);
        }
    }
}


#
# Count one occurrence of the named category or virus at $epoch and push
# the new total to its rrd.
#
sub recordHit {
    my ($name, $epoch) = @_;

    my $id = getVid($name, $epoch);
    upCount($id, $epoch);
    updateRRD($id, $epoch);
}




#
# Parse one log file from byte offset $start up to (roughly) $stop.
#
# Every line that looks like an amavis entry is handed to classify()
# through the global $line. The global $pos tracks how far we got, and
# the state is saved every 1000 lines. Exits with status 2 if the file
# cannot be opened and 3 if the seek fails.
#
sub parseFile {

    my ($fname, $start, $stop) = @_;
    dbg("parseFile ($fname, $start, $stop)");

    #
    # Open up the file we need to parse. The name comes from the command
    # line, so use the three-argument form: a name such as "foo|" or
    # ">foo" must never be taken as a pipe or an output mode.
    #
    my $log;
    unless (open ($log, '<', $fname)) {
        err("Couldn't open logfile $fname: $!");
        exit 2;
    }
    unless (seek ($log, $start, 0)) {
        err("Couldn't seek to $start in logfile $fname: $!");
        exit 3;
    }

    #
    # Loop each line until the current end of file. The read is tested
    # with defined() so that a final line consisting of just "0" is not
    # mistaken for end of file.
    #
    $pos = $start;
    my $lineid = 0;
    while ($pos < $stop and defined($line = <$log>))
    {
        $lineid++;
        $lastepoch = $epoch;

        if ($line =~ /amavis.*?\[\d+\]:/) {
            classify();
        }
        #
        # Where did we get to in the file?
        #
        $pos = tell($log);

        #
        # Save the current statistics every 1000 lines. This way
        # if the program dies we don't have to start again from the
        # beginning each time. Also good for monitoring the graphs
        # to see where we are up to.
        #
        if (!($lineid % 1000)) {
            saveState();
        }

    }
    close($log);

}


#
# The live log has been rotated: locate the file it was rotated to and
# finish parsing that one, starting at our last saved position.
#
# Candidates are tried in this order: "<log>.0", "<log>.1", "<log>.01",
# "<log>-YYYYMMDD" for today, then the same for yesterday. The first one
# that is a regular file wins. If none exists an error is printed and
# the program exits.
#
sub parseRotFile {
    my ($logfile, $spos) = @_;

    # Date stamps (YYYYMMDD) for now and for 24 hours ago.
    my $now = time();
    my ($today, $yesterday) = map {
        my $tm = localtime($_);
        sprintf("%4u%02u%02u", $tm->year + 1900, $tm->mon + 1, $tm->mday);
    } ($now, $now - 60*60*24);

    my $rotlogfile;
    foreach my $suffix (".0", ".1", ".01", "-$today", "-$yesterday") {
        my $candidate = $logfile . $suffix;
        if (-f $candidate) {
            $rotlogfile = $candidate;
            last;
        }
    }

    unless (defined($rotlogfile)) {
        err("Could not open rotated logfile.");
        err("  Tried extentions .0, .1, .01, -$today, -$yesterday");
        exit;
    }

    parseFile ($rotlogfile, $spos, (stat $rotlogfile)[7]);
}


#
# Create the rrd $file, starting at $epoch with a step of $rrdstep.
#
# The file gets a single COUNTER data source called "hits" and four
# AVERAGE plus four MAX archives. Returns 1 on success; on failure the
# RRDs error is reported through err() and -1 is returned.
#
sub createRRD {
    my ($file, $epoch) = @_;

    # [steps per row, rows] for each archive, used for both functions.
    my @archives = ([1, 300], [6, 700], [24, 775], [288, 797]);

    my @rras;
    foreach my $cf ("AVERAGE", "MAX") {
        foreach my $archive (@archives) {
            push @rras, join(":", "RRA", $cf, "0.5", @$archive);
        }
    }

    RRDs::create($file,
                 "--start", $epoch,
                 "--step",  $rrdstep,
                 "DS:hits:COUNTER:" . $rrdstep . ":0:U",
                 @rras);

    my $err = RRDs::error;
    return 1 unless $err;

    err("createRRD: $err");
    return -1;
}


#
# Push the current count of virus $id to its rrd, stamped $epoch.
#
# The rrd ("$libdir/$id.rrd") is created and seeded with a zero count one
# step before $epoch if it does not exist yet. Returns 1 on success
# (including the case where the update is skipped because the rrd already
# holds data for $epoch or later) and -1 after reporting an error.
#
# Note: no prototype. The sub takes two arguments, so the former empty
# prototype "()" was wrong.
#
sub updateRRD {
    my ($id, $epoch) = @_;
    my $count        = $occurence{$id};
    my $rrdfile      = "$libdir/$id.rrd";
    my $err;
    my $last;

    if (! -f $rrdfile) {
        my $seed = $epoch - $rrdstep;

        # createRRD() signals failure with -1, which is a true value, so
        # it has to be compared rather than negated.
        my $created = createRRD($rrdfile, $seed - 1);
        if (!$created or $created < 0) {
            err("updateRRD: Can't update file $rrdfile");
            return -1;
        }

        # rrdtool refuses data stamped at or before the creation start
        # time, hence the rrd starts one second before the seed value.
        RRDs::update($rrdfile, $seed . ":0");
        $err = RRDs::error;
        if ($err) {
            err("updateRRD: $err");
        }
    }

    dbg("Update: $rrdfile at $epoch count $count");

    $last = RRDs::last($rrdfile);
    $err = RRDs::error;
    if ($err) {
        err("updateRRD: $err");
        return -1;
    }

    #
    # We sometimes get two hits in the same second. Check for that here
    # and basically ignore it.
    #
    if ($epoch > $last) {
        my $upd = $epoch . ":" . $count;
        RRDs::update($rrdfile, $upd);

        $err = RRDs::error;
        if ($err) {
            err("updateRRD: $err");
            err("Attempted to update $rrdfile at $epoch count $count");
            return -1;
        }
    }

    return 1;
}



#
# Print a debugging message, prefixed with the program name, when the -d
# flag is in effect. Goes to STDERR, as the help text promises for -d.
#
sub dbg {
    print STDERR "$me: @_\n" if ($debug);
}

#
# Print an informational message, prefixed with the program name, to
# STDOUT unless quiet mode has switched $verbose off.
#
sub msg {
    $verbose and print "$me: @_\n";
}

#
# Print an error message to STDERR, prefixed with the program name and
# "error:". Always printed, whatever the -q and -d flags say.
#
sub err {
    my $text = "$me: error: @_\n";
    print STDERR $text;
}


# ########################################################################
# main() program
# ########################################################################

init();
semlock();
loadState();

#
# Remember how big the log file is right now; we parse up to that point.
# The file can disappear between the existence check in init() and here
# (for instance while it is being rotated), so check the result.
#
$eof = (stat $logfile)[7];
if (!defined $eof) {
    err("Could not stat $logfile: $!");
    semunlock();
    exit 1;
}

if ($eof < $spos) {
    #
    # The log file has rotated under us, so do the rotated logfile first.
    #
    msg("Logfile \"$logfile\" appears to have rotated");
    parseRotFile($logfile, $spos);
    $spos = 0; # reset to the start of the file
}

parseFile ($logfile, $spos, $eof);
saveState();
semunlock();

