#!/usr/local/bin/perl
package Statistics::TTest;
use strict;
use Carp;
use POSIX;
use vars qw($VERSION $AUTOLOAD);
use Statistics::Distributions qw (tdistr fdistr tprob fprob);
use Statistics::PointEstimation;

my %fields=
(
	's1'          =>undef,     #sample 1 
	's2'          =>undef,     #sample 2 
	'significance' =>undef,
	'alpha' =>undef,
	'mean_difference' =>undef,
	'variance' =>undef,
	'standard_deviation' =>undef,
	'standard_error' =>undef,
	'standard_error_equal' =>undef,
	'standard_error_unequal' =>undef,
	'f_cutoff' =>undef,
	'f_statistic' =>undef,
	'df1' =>undef,
	'df2' =>undef,
	'df' => undef,
	'df_equal' =>undef,
	'df_unequal' =>undef,
	'equal_variance'=>undef,
	't_value' =>undef,
	't_statistic' =>undef,
	't_statistic_equal' =>undef,
	't_statistic_unequal' =>undef,
	't_prob' =>undef,
	'null_hypothesis' =>undef,
	'delta' =>undef,
	'lower_clm' =>undef,
	'upper_clm' =>undef,
	'valid' =>undef
);


sub new
{
	my $proto = shift;
        my $class = ref($proto) || $proto;
	my $self= {%fields};
	bless($self,$class);
	return $self;
}

sub load_data
{
	my $self=shift;
	undef $self->{valid};
	my ($sample1,$sample2)=@_;
	if((ref $sample1 ne 'ARRAY')||(ref $sample2 ne 'ARRAY'))
	{
		croak "Invalid input type for load_data() function.\n the 2 inputs must be array references.";
	}

	my $s1= new Statistics::PointEstimation;
	my $s2= new Statistics::PointEstimation;

	if($self->significance())
	{
		$s1->set_significance($self->significance());
		$s2->set_significance($self->significance());
	}

	$s1->add_data($sample1);
	$s2->add_data($sample2);

	croak "Invalid sample size.  sample size for the 2 samples are",$s1->count," and ",$s2->count() ,
	      ". the sample size must be greater than 1" if(($s1->count() <=1)||($s2->count() <=1 ));
	$self->{'s1'}=$s1;
	$self->{'s2'}=$s2;

	$self->perform_t_test();
	return;


}

sub perform_t_test
{
	my $self=shift;
	my $s1=$self->s1();
	my $s2=$self->s2();

	$self->{significance}=95 if (!defined($self->{significance}));
	$self->{alpha}=(100-$self->{significance})/2;
        $self->{alpha}/=100;
	$self->{mean_difference}=$s1->mean()-$s2->mean();
	$self->{variance}=($s1->df()*$s1->variance()+$s2->df()*$s2->variance)/($s1->df()+$s2->df());
	$self->{standard_deviation}=sqrt($self->{variance});
	$self->{standard_error_equal}=sqrt(1/$s1->count()+1/$s2->count())*$self->{standard_deviation};
	$self->{standard_error_unequal}=sqrt(($s1->standard_error())**2+($s2->standard_error())**2);

	$self->{df_equal}=$s1->df()+$s2->df();
	$self->{df_unequal}= (($s1->standard_error())**2+($s2->standard_error())**2)**2
				/
			    (($s1->standard_error())**4/$s1->df()+($s2->standard_error())**4/$s2->df());

	$self->{t_statistic_equal}= abs ($self->{mean_difference}/$self->{standard_error_equal});
	$self->{t_statistic_unequal}= abs ($self->{mean_difference}/$self->{standard_error_unequal});
	my ($df1,$df2);
	if($s1->variance()>=$s2->variance())
	{
		$df1=$s1->df();
		$df2=$s2->df();
		$self->{f_statistic}=$s1->variance()/$s2->variance();

	}
	else
	{
	
		$df1=$s2->df();
		$df2=$s1->df();
		$self->{f_statistic}=$s2->variance()/$s1->variance();
	}
	($self->{df1},$self->{df2})=($df1,$df2);
	$self->{f_cutoff}=fdistr($df1,$df2,$self->{alpha});

	if($self->{f_statistic}<=$self->{f_cutoff})
	{ $self->{equal_variance}=1; }
	else
	{ $self->{equal_variance}=0; }


	if($self->{equal_variance})
	{
		$self->{standard_error}=$self->{standard_error_equal};
		$self->{t_statistic}=$self->{t_statistic_equal};
		$self->{df}=$self->{df_equal};
                                                                                                                                                                                                                                                                                                                                      
	}
	else
	{
		$self->{standard_error}=$self->{standard_error_unequal};
		$self->{t_statistic}=$self->{t_statistic_unequal};
		$self->{df}=$self->{df_unequal};

	}
	$self->{t_prob}=1- abs (tprob(floor($self->{df}),-1*$self->{t_statistic})-tprob(floor($self->{df}),$self->{t_statistic}));
	if($self->{t_prob}<$self->{alpha}*2)
	{
		$self->{null_hypothesis}='rejected';
	}	
	else
	{
		$self->{null_hypothesis}='not rejected';
	}

	$self->{t_value}=tdistr(floor($self->df()),$self->alpha());
	$self->{delta}=$self->t_value()*$self->standard_error();
	$self->{lower_clm}=$self->{mean_difference}-$self->{delta};
	$self->{upper_clm}=$self->{mean_difference}+$self->{delta};
	$self->{valid}=1;
	return;

}

sub print_t_test
{
	my $self=shift;
	foreach my $k ( keys %$self)
        {
		next if ($k eq 's1') || ($k eq 's2'); 
                print "$k: $self->{$k} \n";
        }
        return 1;

}
sub output_t_test
{
	my $self=shift;
	my $s1=$self->{s1};
	my $s2=$self->{s2};
	croak "no data. s1 or s2 is empty.\n" if ((!defined($s1))||(!defined($s2))||($self->valid!=1));
	print "*****************************************************\n\n";
	$s1->output_confidence_interval('1');
	print "*****************************************************\n\n";
	$s2->output_confidence_interval('2');
	print "*****************************************************\n\n";

        print "Comparison of these 2 independent samples.\n";
        print "\t F-statistic=",$self->f_statistic()," , cutoff F-statistic=",$self->f_cutoff(),
		" with alpha level=",$self->alpha*2," and  df =(",$self->df1,",",$self->df2,")\n"; 
	if($self->{equal_variance})
	{ print "\tequal variance assumption is accepted(not rejected) since F-statistic < cutoff F-statistic\n";}
	else
	{ print "\tequal variance assumption is  rejected since F-statistic > cutoff F-statistic\n";}

	print "\tdegree of freedom=",$self->df," , t-statistic=T=",$self->t_statistic," Prob >|T|=",$self->{t_prob},"\n";
	print "\tthe null hypothesis (the 2 samples have the same mean) is ",$self->null_hypothesis(),
		 " since the alpha level is ",$self->alpha()*2,"\n";
	print "\tdifference of the mean=",$self->mean_difference(),", standard error=",$self->standard_error(),"\n";
	print "\t the estimate of the difference of the mean is ", $self->mean_difference()," +/- ",$self->delta(),"\n\t",
                " or (",$self->lower_clm()," to ",$self->upper_clm," ) with ",$self->significance," % of confidence\n"; 

	return 1;
}
sub set_significance
{
	my $self=shift;
        my $significance=shift;
        croak "Invalid Significance. $significance should be 0-100 usually 90,95,99\n"
		 unless (($significance>0)&&($significance<100));
	$self->{significance}=$significance;
        if($self->{s1}&&$self->{s2})
	{
		$self->{s1}->set_significance($significance);
		$self->{s2}->set_significance($significance);
		$self->perform_t_test();

	}
        return 1;
	
}

sub AUTOLOAD
{
	my $self = shift;
  	my $type = ref($self)
    	or croak "$self is not an object";
  	my $name = $AUTOLOAD;
  	$name =~ s/.*://;     ##Strip fully qualified-package portion
  	return if $name eq "DESTROY";
  	unless (exists $self->{$name} )
	{
    		croak "Can't access `$name' field in class $type";
  	}
	 ##Read only method 
	 return $self->{$name};
}

1;
__END__

=head1 NAME

 Statistics::TTest - Perl module to perform T-test on 2 independent samples


=head1 SYNOPSIS

 use Statistics::PointEstimation;
 use Statistics::TTest;
 my @r1=();
 my @r2=();
 my $rand;
        
  for($i=1;$i<=32;$i++) #generate a uniformly distributed sample with mean=5   
  {

          $rand=rand(10);
          push @r1,$rand;
          $rand=rand(10)-2;
          push @r2,$rand;
  }


 my $ttest = new Statistics::TTest;  
 $ttest->set_significance(90);
 $ttest->load_data(\@r1,\@r2);  
 $ttest->output_t_test();   
 $ttest->set_significance(99);
 $ttest->print_t_test();  #list out t-test related data
  
 #the following thes same as calling output_t_test()
        my $s1=$ttest->{s1};  #sample 1  a Statistics::PointEstimation object
        my $s2=$ttest->{s2};  #sample 2  a Statistics::PointEstimation object
        print "*****************************************************\n\n";
        $s1->output_confidence_interval('1');
        print "*****************************************************\n\n";
        $s2->output_confidence_interval('2');
        print "*****************************************************\n\n";

        print "Comparison of these 2 independent samples.\n";
        print "\t F-statistic=",$ttest->f_statistic()," , cutoff F-statistic=",$ttest->f_cutoff(),
                " with alpha level=",$ttest->alpha*2," and  df =(",$ttest->df1,",",$ttest->df2,")\n"; 
        if($ttest->{equal_variance})
        { print "\tequal variance assumption is accepted(not rejected) since F-statistic < cutoff F-statistic\n";}
        else
        { print "\tequal variance assumption is  rejected since F-statistic > cutoff F-statistic\n";}

        print "\tdegree of freedom=",$ttest->df," , t-statistic=T=",$ttest->t_statistic," Prob >|T|=",$ttest->{t_prob},"\n";
        print "\tthe null hypothesis (the 2 samples have the same mean) is ",$ttest->null_hypothesis(),
                 " since the alpha level is ",$ttest->alpha()*2,"\n";
        print "\tdifference of the mean=",$ttest->mean_difference(),", standard error=",$ttest->standard_error(),"\n";
        print "\t the estimate of the difference of the mean is ", $ttest->mean_difference()," +/- ",$ttest->delta(),"\n\t",
                " or (",$ttest->lower_clm()," to ",$ttest->upper_clm," ) with ",$ttest->significance," % of confidence\n"; 
  



=head1 DESCRIPTION

This is the Statistical T-Test module to compare 2 independent samples. It takes 2 array of point measures, compute the confidence
 intervals using the PointEstimation module (which is also included in this package) and use the T-statistic to test the null hypothesis. If the null hypothesis is rejected, the difference will be given as the lower_clm and upper_clm of the TTest object. 

=head1 AUTHOR

Yun-Fang Juan , Yahoo! Inc.  (yunfang@yahoo-inc.com)

=head1 SEE ALSO

Statistics::Descriptive Statistics::Distributions Statistics::PointEstimation

=cut 

