=head1 NAME

iPE::Model::DurationDistribution::GEOMETRIC

=head1 DESCRIPTION

Describes a geometric length distribution.  During training it counts the lengths of the example features it is given and estimates the mean feature length from those counts.

=cut

package iPE::Model::DurationDistribution::GEOMETRIC;

use iPE;
use iPE::Globals;
use base ("iPE::Model::DurationDistribution");
use strict;


# Reset the training state of this distribution: an empty length => count
# table and a mean of 0.  Before doing so, warn when a pseudocount smoother
# has been configured on an open-ended ("L") tail.
sub init {
    my $self = shift;

    # Pseudocounts add mass to every bucket, which is a poor match for a
    # tail that runs out to infinity, so tell the user about the mismatch.
    my $smoother_class = ref($self->{pseudocountSmoother_});
    if ($smoother_class eq "iPE::Smoother::Pseudocount"
            && $self->max eq "L") {
        my $message = join "",
            __PACKAGE__, ": WARNING: Using a pseudocount smoother on a\n",
            "GEOMETRIC tail distribution does not make a lot of sense ",
            "since it adds counts\nto every bucket in the distribution. ",
            "Consider using the priorcounts feature, which adds a mass\n",
            "to the entire distribution.\n";
        Warn($message);
    }

    $self->{counts_} = {};
    $self->{mean_}   = 0;
}

# Read-only accessor: the hash reference mapping length bucket => count.
sub counts {
    my ($this) = @_;
    return $this->{counts_};
}

# Read-only accessor: the current mean of the distribution.
sub mean {
    my ($this) = @_;
    return $this->{mean_};
}

# Add one observed feature to the length table.
#   $feature - object providing a length() method
#   $wt      - weight added to the bucket for this feature's length
# The bucket is the feature length divided by lengthUnit_, truncated to an
# integer.
sub countFeature {
    my ($this, $feature, $wt) = @_;

    my $bucket = int($feature->length() / $this->{lengthUnit_});

    # The first observation in a bucket stores the weight as given; later
    # observations accumulate on top of it.
    $this->{counts_}{$bucket} = defined $this->{counts_}{$bucket}
        ? $this->{counts_}{$bucket} + $wt
        : $wt;
}

# Fill in the gaps of the count table and hand it to both smoothers.
# Every bucket from min up to the upper bound is given an explicit count
# (0 where nothing was observed) so that the smoothers see a contiguous
# range.  For an open-ended ("L") distribution the upper bound is the largest
# observed bucket; with no observations at all the method returns without
# smoothing.
sub smooth {
    my ($this) = @_;

    my $table = $this->counts;
    my $limit = $this->max;

    my $upper;
    if ($limit eq "L") {
        # Largest observed bucket, compared numerically; stays undef when
        # no samples are present.
        for my $bucket (keys %$table) {
            $upper = $bucket if !defined($upper) || $bucket > $upper;
        }
    }
    else {
        $upper = $limit;
    }

    return unless defined $upper;

    my $len = $this->min;
    while ($len <= $upper) {
        $table->{$len} = 0 unless defined $table->{$len};
        $len++;
    }

    $this->pseudocountSmoother->smoothHref($table, $this->samples);
    $this->smoother->smoothHref($table, $this->samples);
}

# Estimate the mean of the distribution from the collected counts.
#
# When the density is non-zero, the mean is the count-weighted average bucket
# length (using samples/density as the sample total), shifted by min-1 and
# scaled by lengthUnit_.  For a bounded range (max is not "L") the value of
# the exponential PDF at max, (1/mean)*exp(-max/mean), is stored through
# finalProb.
#
# When the density is zero a warning is issued and finalProb is set to -1.
sub normalize {
    my ($this) = @_;

    if($this->density) {
        my $samples = $this->samples/$this->density;
        my $weighted_length = 0;
        for my $len (keys (%{$this->{counts_}})) {
            $weighted_length += $this->{counts_}->{$len}*$len;
        }
        $this->{mean_} =
            ($weighted_length/$samples-$this->min+1)*$this->{lengthUnit_};

        # set the value of the PDF at the end of this part of the distribution.
        # The exponent must be negative (decaying density), matching the
        # form used in match_with_density; a positive exponent grows with
        # max and can produce values above 1.
        if($this->max ne "L") {
            $this->finalProb((1/$this->{mean_})
                *exp(-1*($this->max/$this->{mean_})));
        }
    }
    else {
        Warn("WARNING: scoring GEOMETRIC duration $this->{region_} with mean 0 ".
            "because no samples were found.\n");
        $this->finalProb(-1);
    }

}

# Fit this piece of the duration PDF so that it continues from the previous
# piece: it starts at probability initialProb (taken at min-1) and covers
# the given density.
#
# Results are stored in $this->{mean_} (lambda) and $this->{s_} (scale); for
# a bounded piece the PDF value at max is also stored through finalProb.
#
# The method returns early, after a warning and without touching mean_ or
# s_, when initialProb is -1, when initialProb is 0, or when density is 0.
# In those cases mean_ keeps the value it already had (init sets it to 0).
sub match_with_density {
    my ($this) = @_;

    if($this->initialProb == -1) {
        Warn(__PACKAGE__." WARNING: no way to match $this->{region_} to its ".
            "previous region.\n(Does one exist?)  Setting mean to 0.\n");
        return;
    }
    elsif($this->initialProb == 0) {
        Warn(__PACKAGE__." WARNING: region previous to $this->{region_} ended ".
            "with probability 0.\n");
        return;
    }
    elsif($this->density == 0) {
        Warn(__PACKAGE__." WARNING: no samples for range $this->{region_} ".
            "from ".$this->min." to ".$this->max.".\nSetting mean to 0.\n");
        return;
    }

    if($this->max ne "L") {
        # solve the transcendental equation that gives us lambda.
        # see doc or Randy Brown's notes for the details on the derivation
        # The equation is solved by brute force: every integer lambda from 1
        # to $MAX_LAMBDA is tried and the one with the smallest absolute
        # difference between both sides is kept.
        my $MAX_LAMBDA = 1000000;
        my $min_diff = -1;      # -1 means "no candidate seen yet"
        my $best_lambda = -1;

        # set these variables for a boost in performance (fewer function calls).
        my $density = $this->density;
        my $initialProb = $this->initialProb;
        my $minmaxdiff = ($this->min - 1) - $this->max;
        for(my $lambda = 1; $lambda <= $MAX_LAMBDA; $lambda++) {
            my $lhs = $initialProb*(1-exp($minmaxdiff/$lambda));
            my $rhs = $density/$lambda;
            my $diff = abs($lhs-$rhs);
            if($diff < $min_diff || $min_diff == -1) {
                ($min_diff, $best_lambda) = ($diff, $lambda);
            }
        }

        # if we get a lambda at the lowerbound or the upperbound, then we
        # probably got the wrong answer for some reason.
        # as a fallback, we use this section as the entire tail, since that
        # can always be fit.  we must set the density to the remaining
        # density of the PDF, and the final probability to -1, to signal
        # that this piece is the last one.
        if($best_lambda == 1 || $best_lambda >= $MAX_LAMBDA) {
            Warn(__PACKAGE__." WARNING: No reasonable curve could be fit\n ".
                "between ".$this->min." and ".$this->max." for ".$this->region.
                " distribution.  This will be the last tail in the PDF.\n");
            $this->finalProb(-1);
            # now this piece is the entire tail, so it is responsible for the
            # rest of the density.
            $this->density(1-$this->cumDensity);
            # fall through to the default density-to-infinity fit below.
        }
        else {
            $this->{mean_} = $best_lambda;
            $this->{s_} = $best_lambda*$this->initialProb
                *exp(($this->min - 1)/$best_lambda);
            # set the value of the PDF at the end
            # of this part of the distribution.
            $this->finalProb(($this->{s_}/$this->{mean_})
                *exp(-1*($this->max/$this->{mean_})));
            return;
        }
    }

    # we want to solve the exponential PDF for lambda when the density
    # is the given density and the probability of min-1 is initialProb
    # this results in the following:
    $this->{mean_} = $this->density/$this->initialProb;
    $this->{s_}    =
        exp(($this->min - 1)/$this->{mean_})*
            ($this->{mean_}*$this->initialProb);
}

# Intentionally a no-op: the only parameter of this distribution, the mean,
# is already computed during the normalization step.
sub score {
    return;
}

# Build the parameter string for this distribution and store it through
# setParamString.
#   $out  - formatter object providing intf, floatf and tab
#   $mode - "score" or "prob" writes the mean (and, when defined, s_ on a
#           second line); any other mode writes the non-zero counts as
#           "length: count" entries, four per line.
sub outputPrepare {
    my ($this, $out, $mode) = @_;

    my $text = "";
    if($mode ne "score" && $mode ne "prob") {
        my $table = $this->counts;
        my $shown = 0;
        for my $len (sort { $a <=> $b } keys %$table) {
            next if $table->{$len} == 0;
            # start a new line before every fifth, ninth, ... entry
            $text .= "\n" if $shown && $shown % 4 == 0;
            $text .= $out->intf($len).": ".
                $out->floatf($table->{$len}).$out->tab;
            $shown++;
        }
    }
    else {
        $text = $out->intf($this->mean);
        $text .= "\n".$out->floatf($this->{s_}) if defined($this->{s_});
    }

    $this->setParamString($text);
}

=head1 SEE ALSO

L<iPE::Model::DurationDistribution>

=head1 AUTHOR

Bob Zimmermann (rpz@cse.wustl.edu)

=cut
1;
