#!/usr/bin/perl -w

#
#    Copyright 2008 Niklas Edmundsson <nikke@acc.umu.se>,
#                   Tomas Ögren <stric@acc.umu.se>,
#                   David Cameron <cameron@ndgf.org>
#
#    Originally developed by Niklas Edmundsson and Tomas Ögren as
#    cleanbyage. Modified, renamed cache-clean and maintained
#    for ARC by David Cameron.
#
#    Released under Apache License Version 2.0
#

use Sys::Hostname;
use File::Find ();
use File::Path;
use Getopt::Std;
use Fcntl ':mode';
use DirHandle;
use File::Basename;
use POSIX;

use strict;
use warnings;

# Set the variable $File::Find::dont_use_nlink if you're using AFS,
# since AFS cheats.

# for the convenience of &wanted calls, including -eval statements:
use vars qw/*name *dir *prune/;
*name   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

sub wanted;
sub debug;
sub printsize;
sub diskspace;

my(%opts);

my $configfile = '/etc/arc.conf';

# max/min used percentage
my $maxusedpercent = 0;
my $minusedpercent = 0;
my $expirytime = 0;

my %files;
my %expiredfiles;
my $totsize = 0;
my $totlocksize = 0;
my $totlockfiles = 0;

my $debuglevel = 'INFO';
my $currenttime = time();

getopts('hsSc:m:M:E:f:D:', \%opts);

if(defined($opts{c})) {
    $configfile = $opts{c};
}
if(defined($opts{M})) {
    $maxusedpercent = $opts{M};
}
if(defined($opts{m})) {
    $minusedpercent = $opts{m};
}
if(defined($opts{D})) {
    $debuglevel = $opts{D};
}
if(defined($opts{s})) {
    $debuglevel = 'ERROR';
}

if($maxusedpercent < 0 || $maxusedpercent > 100) {
    die "Bad value for -M: $maxusedpercent\n";
}
if($minusedpercent < 0 || $minusedpercent > 100) {
    die "Bad value for -m: $minusedpercent\n";
}
if($minusedpercent > $maxusedpercent) {
    die "-M can't be smaller than -m (now $maxusedpercent/$minusedpercent)\n";
}

if(defined($opts{E})) {
    if ($opts{E} =~ m/^(\d+)(\D)$/) {
        if ($2 eq 'd') {
        $expirytime = $1 * 86400;
        } elsif ($2 eq 'h') {
            $expirytime = $1 * 3600;
        } elsif ($2 eq 'm') {
            $expirytime = $1 * 60;
        } elsif ($2 eq 's') {
            $expirytime = $1;
        } else {
            die "Bad format in -E option value\n";
        }
    } elsif ($opts{E} =~ m/^\d+$/) {
        $expirytime = $opts{E};
    } else {
        die "Bad format in -E option value\n";
    }
}

sub usage {
    print <<EOH;
   usage: cache-clean [-h] [-s] [-S] [-m <NN> -M <NN>] [-E N] [-D debug_level]
            [-f space_command] [ -c <arex_config_file> | <dir1> [<dir2> [...]] ]
    -h       - This help
    -s       - Statistics mode, show cache usage stats, dont delete anything
    -S       - Calculate cache size rather than using used file system space
    -c       - path to an A-REX config file, xml or ini format
    -M NN    - Maximum usage of file system. When to start cleaning the cache (percent)
    -m NN    - Minimum usage of file system. When to stop cleaning cache (percent)
    -E N     - Delete all files whose access time is older than N. Examples
                 of N are 1800, 90s, 24h, 30d (default is seconds)
    -f command - Path and optionally arguments to a command which outputs
                 "total_bytes used_bytes" of the file system the cache is on.
                 The cache dir is passed as an argument to this command.
    -D level - Debug level, FATAL, ERROR, WARNING, INFO, VERBOSE or DEBUG.
                 Default is INFO

   Caches are given by dir1, dir2.. or taken from the config file specified
   by -c, ARC_CONFIG or the default /etc/arc.conf.

EOH
    exit 1;
}

usage() if(defined($opts{'h'}) || ((!defined($opts{'M'}) || !defined($opts{'m'})) && !defined($opts{'E'}) && !defined($opts{'s'})));

if (!$configfile && $ENV{'ARC_CONFIG'} && -e $ENV{'ARC_CONFIG'}) {
    $configfile = $ENV{'ARC_CONFIG'};
}

my %loglevels = (FATAL => 0, ERROR => 1, WARNING => 2, INFO => 3, VERBOSE => 4, DEBUG => 5);
die "Bad debug level $debuglevel\n" unless $loglevels{$debuglevel};
my $loglevel = $loglevels{$debuglevel};

# Prints message with required format
sub p($) {
    my $s = shift;
    my $t = strftime "%F %T", localtime;
    print STDERR "[$t] [cache-clean] $s\n";
}

# logging functions
sub error($) {
    if ($loglevel >= 1) {
        my $s = shift;
        p("[ERROR] $s");
    }
}

sub warning($) {
    if ($loglevel >= 2) {
        my $s = shift;
        p("[WARNING] $s");
    }
}

sub info($) {
    if ($loglevel >= 3) {
        my $s = shift;
        p("[INFO] $s");
    }
}

sub verbose($) {
    if ($loglevel >= 4) {
        my $s = shift;
        p("[VERBOSE] $s");
    }
}

info('Cache cleaning started');

my @caches = @ARGV;

if (!@caches) {
    die "No config file found and no caches specified\n" unless $configfile;
    die "No such configuration file $configfile\n" unless -e $configfile;

    open CONFIG, $configfile;
    my $cacheblock = 0;
    while (my $line = <CONFIG>) {
        if ($line =~ /^\[/) { $cacheblock = 0; }
        if ($line =~ /^\[arex\/cache\]/) { $cacheblock = 1; }
        next unless $cacheblock;
        if ($line =~ /^cachedir/) {
            $line =~ /\S+\s*=\s*([^\s]+)/;
            my $cache = $1;
            push @caches, $cache;
        }
    }
    close CONFIG;
    die "No caches found in config file '$configfile'\n" unless @caches;
}

foreach my $filesystem (@caches) {

    $filesystem =~ s|/+$|| unless $filesystem eq "/";
    next if $filesystem eq "";

    if ($filesystem =~ /%/) {
        warning("$filesystem: Warning: cache-clean cannot deal with substitutions");
        next;
    }

    if (! -d $filesystem || ! -d $filesystem."/data") {
        info("$filesystem: Cache is empty");
        next;
    }

    # follow sym links to real filesystem
    my $symlinkdest = $filesystem;
    while ($symlinkdest) {
        $filesystem = $symlinkdest;
        $symlinkdest = readlink($symlinkdest);
        $symlinkdest =~ s|/+$|| if $symlinkdest;
    }

    my $fsvalues = diskspace($filesystem);

    if(!($fsvalues)) {
        warning("Unable to stat $filesystem");
        next;
    }

    my $fssize = $fsvalues->{total};
    my $fsused = $fsvalues->{used};

    my $maxfbytes=$fssize*$maxusedpercent/100;
    info(join("", "$filesystem: used space ", printsize($fsused), " / ", printsize($fssize), " (", sprintf("%.2f",100*$fsused/$fssize), "%)"));
    if ($expirytime == 0 && $fsused < $maxfbytes && !$opts{'s'}) {
        info(join("", "Used space is lower than upper limit (", $maxusedpercent, "%)"));
        next;
    }

    my $minfbytes=$fssize*$minusedpercent/100;

    %files = ();
    %expiredfiles = ();
    $totsize = 0;
    $totlocksize = 0;
    $totlockfiles = 0;

    File::Find::find({wanted => \&wanted}, $filesystem."/data");

    if($opts{'s'}) {
        my %allfiles = (%files, %expiredfiles);
        print "\nUsage statistics: $filesystem\n";
        print "Total deletable files found: ",scalar keys %allfiles," ($totlockfiles files locked or in use)\n";
        print "Total size of deletable files found: ",printsize($totsize)," (",printsize($totlocksize)," locked or in use)\n";
        print "Used space on file system: ",printsize($fsused)," / ",printsize($fssize), " (",sprintf("%.2f",100*$fsused/$fssize),"%)\n";
        my $increment = $totsize / 10;
        if($increment < 1) {
            print "Total size too small to show usage histogram\n";
            next;
        }

        printf "%-21s %-25s %s\n", "At size (% of total)", "Newest file", "Oldest file";
        my $nextinc = $increment;
        my $accumulated = 0;
        my ($newatime, $lastatime);

        foreach my $fil (sort { $allfiles{$b}{atime} <=> $allfiles{$a}{atime} } 
              keys %allfiles) {
            $accumulated += $allfiles{$fil}{size};
            if(!$newatime) {
                $newatime = $allfiles{$fil}{atime};
            }

            if($accumulated > $nextinc) {
                printf "%-21s %-25s %s\n", 
                printsize($accumulated)." (".int(($accumulated/$totsize)*100)."%)",
                scalar localtime($newatime),
                scalar localtime($allfiles{$fil}{atime});
                while($nextinc < $accumulated) {
                    $nextinc += $increment;
                }
                $newatime = undef;
                $lastatime = undef;
            }
            else {
                $lastatime = $allfiles{$fil}{atime};
            }
        }
        printf "%-21s %-25s %s\n", 
        printsize($accumulated)." (100%)", "-",
        scalar localtime($lastatime) if($lastatime);
        next;
    }

    # remove expired files
    if ($expirytime > 0) {
        foreach my $fil (keys %expiredfiles) {
            next if (-d "$fil");
            if (-e "$fil.lock") {
                my $atime = (stat("$fil.lock"))[8];
                next if ( defined $atime && ($currenttime - $atime) <= 86400);
                unlink "$fil.lock";
            }

            if ( unlink $fil ) {
                $fsused-=$expiredfiles{$fil}{size};
                if (defined($opts{'D'}) && -e "$fil.meta") {
                    open FILE, "$fil.meta";
                    my @lines = <FILE>;
                    close FILE;
                    my @values = split(' ', $lines[0]);
                    verbose(join("", "Deleting expired file: $fil  atime: ", scalar localtime($expiredfiles{$fil}{atime}), "  size: $expiredfiles{$fil}{size}  url: $values[0]"));
                }
                else {
                    verbose(join("", "Deleting expired file: $fil  atime: ", scalar localtime($expiredfiles{$fil}{atime}), "  size: $expiredfiles{$fil}{size}"));
                }
            } else {
                warning("Error deleting file '$fil': $!");
            }

            # not critical if this fails
            next if (! -e "$fil.meta");
            if ( unlink "$fil.meta" ) {
                my $lastslash = rindex($fil, "/");
                if ( rmdir(substr($fil, 0, $lastslash))) {
                     verbose("Deleting directory ".substr($fil, 0, $lastslash));
                }
            } else {
                warning("Error deleting file '$fil.meta': $!");
            } 
        }
    }
    #Are we still exceding limit after deleting expired files.
    if ($fsused > $maxfbytes) {
        # delete in order of access
        foreach my $fil (sort { $files{$a}{atime} <=> $files{$b}{atime} } keys %files) {
            last if $fsused < $minfbytes;
            next if (-d "$fil");
            if (-e "$fil.lock") {
                my $atime = (stat("$fil.lock"))[8];
                next if ( defined $atime && ($currenttime - $atime) <= 86400);
                unlink "$fil.lock"
            }

            if ( unlink $fil ) {
                $fsused-=$files{$fil}{size};
                if (defined($opts{'D'}) && -e "$fil.meta") {
                    open FILE, "$fil.meta";
                    my $line = <FILE>;
                    close FILE;
                    chomp($line);
                    verbose(join("", "Deleting file: $fil  atime: ", scalar localtime($files{$fil}{atime}), "  size: $files{$fil}{size}  url: $line"));
                }
                else {
                    verbose(join("","Deleting file: $fil  atime: ", scalar localtime($files{$fil}{atime}), "  size: $files{$fil}{size}"));
                }
            } else {
                warning("Error deleting file '$fil': $!");
            }
            next if (! -e "$fil.meta");
            # not critical if this fails
            if ( unlink "$fil.meta" ) {
                my $lastslash = rindex($fil, "/");
                if ( rmdir(substr($fil, 0, $lastslash))) {
                    verbose("Deleting directory ".substr($fil, 0, $lastslash));
                }
            } else {
                warning("Error deleting file '$fil.meta': $!");
            } 
        }
    }
    info(join("", "Cleaning finished, used space now ", printsize($fsused), " / ", printsize($fssize), " (", sprintf("%.2f",100*$fsused/$fssize),"%)"));
}
exit 0;

sub wanted {

    return if $name =~ m|\.lock$|;
    return if $name =~ m|\.meta$|;

    my ($links, $atime, $blocks);

    ($links, $atime, $blocks) = (lstat($_))[3,8,12];

    return unless defined $atime;

    return unless !(-d _) || -f _;

    if ($links != 1) {
        $totlocksize += 512 * $blocks;
        $totlockfiles++;
        return;
    }

    if (-e "$name.lock") {
        # check if lock is still valid
        my $lockatime = (stat("$name.lock"))[8];
        if ( defined $lockatime && ($currenttime - $lockatime) <= 86400) {
            $totlocksize += 512 * $blocks;
            $totlockfiles++;
            return;
        }
    }

    if ($expirytime > 0 && ($currenttime - $atime) >= $expirytime) {
        $expiredfiles{$name}{atime}=$atime;
        $expiredfiles{$name}{size}= 512 * $blocks;
    }
    else {
        $files{$name}{atime}=$atime;
        $files{$name}{size}= 512 * $blocks;
    }
    $totsize += 512 * $blocks;
}

sub printsize($)
{
    my $size = shift;

    if($size > 1024*1024*1024*1024) {
        $size = int($size/(1024*1024*1024*1024));
        return "$size TB";
    }
    if($size > 1024*1024*1024) {
        $size = int($size/(1024*1024*1024));
        return "$size GB";
    }
    if($size > 1024*1024) {
        $size = int($size/(1024*1024));
        return "$size MB";
    }
    if($size > 1024) {
        $size = int($size/1024);
        return "$size kB";
    }

    return $size;
}

#
# Returns disk space (total, used by cache and free) in bytes on a filesystem
# Taken from arc1/trunk/src/services/a-rex/infoproviders/HostInfo.pm
# Updated to calculate actual use
# TODO: Put in common place
#
sub diskspace ($) {
    my $path = shift;
    my ($diskused, $disktotal);

    if ( -d "$path") {
        # user-specified tool
        if (defined($opts{f})) {
            my $spacecmd = $opts{f};
            my @output= `$spacecmd $path`;
            if ($? != 0) {
                warning("Failed running $spacecmd");
            } elsif ($output[0] =~ m/(\d+) (\d+)/) {
                $disktotal = $1;
                $diskused  = $2;
            } else {
                warning("Bad output from $spacecmd: @output");
            }
        }
        # check if on afs
        elsif ($path =~ m#/afs/#) {
            my @dfstring =`fs listquota $path 2>/dev/null`;
            if ($? != 0) {
                warning("Failed running: fs listquota $path");
            } elsif ($dfstring[-1] =~ /\s+(\d+)\s+(\d+)\s+\d+%\s+\d+%/) {
                $disktotal = $1*1024;
                $diskused  = $2*1024;
            } else {
                warning("Failed interpreting output of: fs listquota $path");
            }
        # "ordinary" disk
        } else {
            my @dfstring =`df -k $path 2>/dev/null`;
            if ($? != 0) {
                warning("Failed running: df -k $path");

            # The first column may be printed on a separate line.
            # The relevant numbers are always on the last line.
            } elsif ($dfstring[-1] =~ /\s+(\d+)\s+(\d+)\s+\d+\s+\d+%\s+\//) {
                $disktotal = $1*1024;
                $diskused  = $2*1024;
            } else {
                warning("Failed interpreting output of: df -k $path");
            }
        }
        # get actual used disk for caches on shared partitions if configured
        if (defined($opts{S})) {
            $diskused = undef;
            my @dustring =`du -ks $path 2>/dev/null`;
            if ($? != 0) {
                warning("Failed running: du -ks $path");
            } elsif ($dustring[-1] =~ /(\d+)\s+[\w\/]+/) {
                $diskused = $1*1024;
            } else {
                warning("Failed interpreting output of: du -ks $path");
            }
        }
    } else {
        warning("Not a directory: $path");
    }

    return undef unless defined($disktotal) and defined($diskused);
    return {total => $disktotal, used => $diskused};
}
