View of /trunk/pg/lib/Regression.pm

Revision 1050 - Fri Jun 6 21:39:42 2003 UTC by sh002i
File size: 12849 byte(s)

moved PG modules and macro files from webwork-modperl to pg
-sam

#!/usr/bin/perl -w

package Regression;

$VERSION = 0.1;

use strict;

################################################################
use constant TINY => 1e-8;
use constant DEBUGGING => 0;
################################################################
=pod

=head1 NAME

  Regression.pm - weighted linear regression package (line+plane fitting)

=head1 DESCRIPTION

Regression.pm is a multivariate linear regression package.  That is,
it estimates the coefficients b for a line-fit of the type

  y = b(0)*x(0) + b(1)*x(1) + b(2)*x(2) + ... + b(k-1)*x(k-1)

given a data set of N observations, each with k independent x variables and
one y variable.  Naturally, N must be greater than k---and preferably
considerably greater.  Any reasonable undergraduate statistics book will
explain what a regression is.  Most of the time, the user will provide a
constant ('1') as x(0) for each observation in order to allow the regression
package to fit an intercept.
=head1 USAGE

If the sample data for (x1, x2, y) includes ($x1[$i], $x2[$i], $y[$i]) for 0<=$i<=5, type

  my $reg = Regression->new( 3, "y", [ "const", "x1", "x2" ] );

  for(my $i=0; $i<6; $i++){
    $reg->include( $y[$i], [ 1.0, $x1[$i], $x2[$i] ] );
  }

  my $coeff = $reg->theta();   # theta() returns an array reference

  my $b0 = $coeff->[0];
  my $b1 = $coeff->[1];
  my $b2 = $coeff->[2];
=head1 ALGORITHM

=head2 Original Algorithm (ALGOL-60):

  W. M. Gentleman, University of Waterloo, "Basic Description
  For Large, Sparse Or Weighted Linear Least Squares Problems
  (Algorithm AS 75)," Applied Statistics (1974) Vol 23; No. 3

=head2 INTERNALS

R=Rbar is an upper-right triangular matrix, kept in normalized
form with implicit 1's on the diagonal.  D is a diagonal scaling
matrix.  These correspond to "standard Regression usage" as

                X' X  = R' D R

A backsubstitution routine (in thetacov) allows one to invert the
R matrix (the inverse is upper-right triangular, too!).  Call this
matrix H, that is H=R^(-1).

    (X' X)^(-1) = [ (D^(1/2) R)' (D^(1/2) R) ]^(-1)
                = [ R^(-1) D^(-1/2) ] [ R^(-1) D^(-1/2) ]'

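The coefficient estimates themselves solve the triangular system

    R theta = thetabar

by backsubstitution; this is exactly what the theta() method below
does.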

=head2 Remarks

This algorithm is the statistical "standard."  A new observation
can be inserted (with a weight!) at any time, and each insertion
takes only time quadratic in the number of variables.  The storage
requirement is also quadratic in the number of independent
variables.  A practically unlimited number of observations can
thus be processed.
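
As a sketch of what this buys in practice (the file name and format
here are purely illustrative), observations can be streamed through
the object one at a time, so the full data set never needs to be
held in memory:

  my $reg = Regression->new( 3, "streamed", [ "const", "x1", "x2" ] );
  open(my $fh, "<", "data.txt") or die "cannot open: $!";
  while (my $line = <$fh>) {
    my ($y, $x1, $x2) = split(' ', $line);
    $reg->include( $y, [ 1.0, $x1, $x2 ] );   # quadratic time per observation
  }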

=head1 AUTHOR

Naturally, Gentleman invented this algorithm.  Adaptation by Ivo
Welch.  Alan Miller (alan@dmsmelb.mel.dms.CSIRO.AU) pointed out
nicer ways to compute the R^2.

=head1 Subroutines

=cut
################################################################


#### let's start with handling of missing data ("nan" or "NaN")

my $nan= "NaN";
sub isNaN {
  # sanity check: the argument should at least look numeric or NaN-like
  if ($_[0] !~ /[0-9nan]/) { die "definitely not a number in NaN: '$_[0]'"; }
  # the string "NaN" counts, and so does a true IEEE NaN, which is the
  # only value that does not compare equal to itself
  return ($_[0]=~ /NaN/i) || ($_[0] != $_[0]);
}


################################################################

=pod

=head2 new

receives the number of variables in each observation (i.e., an
integer) and returns the blessed data structure.  Also takes an
optional name for this regression to remember, as well as a
reference to an array of k names for the X variables.

=cut

################################################################
sub new {
  my $classname= shift(@_);
  my $K= shift(@_); # the number of variables
  my $regname= shift(@_) || "with no name";

  if (!defined($K)) { die "Regression->new needs at least one argument for the number of variables"; }
  if ($K<=1) { die "Cannot run a regression without at least two variables."; }

  # helper: a reference to a zero-filled array with indexes 0..$_[0]
  sub zerovec {
    my @rv;
    for (my $i=0; $i<=$_[0]; ++$i) { $rv[$i]=0; }
    return \@rv;
  }

  bless {
   k => $K,
   regname => $regname,
   xnames => shift(@_),

   # constantly updated
   n => 0,
   sse => 0,
   syy => 0,
   sy => 0,
   wghtn => 0,
   d => zerovec($K),
   thetabar => zerovec($K),
   rbarsize => ($K+1)*$K/2+1,
   rbar => zerovec(($K+1)*$K/2+1),

   # other constants
   neverabort => 0,

   # computed on demand
   theta => undef,
   sigmasq => undef,
   rsq => undef,
   adjrsq => undef
  }, $classname;
}

################################################################

=pod

=head2 dump

is used for debugging.

=cut

################################################################
sub dump {
  my $this= $_[0];
  print "****************************************************************\n";
  print "Regression '$this->{regname}'\n";
  print "****************************************************************\n";
  sub print1val {
    no strict;
    print "$_[1]($_[2])=\t". ((defined($_[0]->{ $_[2] }) ? $_[0]->{ $_[2] } : "intentionally undef"));

    my $ref=$_[0]->{ $_[2] };

    if (ref($ref) eq 'ARRAY') {
      my $arrayref= $ref;
      print " $#$arrayref+1 elements:\n";
      if ($#$arrayref>30) {
        print "\t";
        for(my $i=0; $i<$#$arrayref+1; ++$i) { print "$i='$arrayref->[$i]';"; }
        print "\n";
      }
      else {
        for(my $i=0; $i<$#$arrayref+1; ++$i) { print "\t$i=\t'$arrayref->[$i]'\n"; }
      }
    }
    elsif (ref($ref) eq 'HASH') {
      my $hashref= $ref;
      print " ".scalar(keys(%$hashref))." elements\n";
      while (my ($key, $val) = each(%$hashref)) {
        print "\t'$key'=>'$val';\n";
      }
    }
    else {
      print " [was scalar]\n";
    }
  }

  while (my ($key, $val) = each(%$this)) {
    $this->print1val($key, $key);
  }
  print "****************************************************************\n";
}

################################################################
=pod

=head2 print

prints the estimated coefficients, R^2, and N.

=cut
################################################################
sub print {
  my $this= $_[0];
  print "****************************************************************\n";
  print "Regression '$this->{regname}'\n";
  print "****************************************************************\n";

  my $theta= $this->theta();

  for (my $i=0; $i< $this->k(); ++$i) {
    print "Theta[$i".(defined($this->{xnames}->[$i]) ? "='$this->{xnames}->[$i]'":"")."]= ".sprintf("%12.4f", $theta->[$i])."\n";
  }
  print "R^2= ".sprintf("%.3f", $this->rsq()).", N= ".$this->n()."\n";
  print "****************************************************************\n";
}


################################################################
=pod

=head2 include

receives one new observation.  The call is

  $blessedregr->include( $yvariable, [ $x1, $x2, $x3 ... $xk ], 1.0 );

where 1.0 is an (optional) weight.  Note that inclusion with a
weight of -1 can be used to delete an observation.

The function returns the number of observations included so far.
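
For example (a sketch; $y, $x1, and $x2 are assumed to hold one
observation), an observation that was included earlier can be
removed again by re-including it with weight -1:

  $blessedregr->include( $y, [ 1.0, $x1, $x2 ], 1.0 );   # add with weight 1
  $blessedregr->include( $y, [ 1.0, $x1, $x2 ], -1.0 );  # remove it again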

=cut
################################################################
sub include {
  my $this = shift();
  my $yelement= shift();
  my $xrow= shift();
  my $weight= shift() || 1.0;

  # omit observations with missing values
  if (!defined($yelement)) { die "Internal Error: yelement is undef"; }
  if (isNaN($yelement)) { return $this->{n}; }

  my @xcopy;
  for (my $i=1; $i<=$this->{k}; ++$i) {
    if (!defined($xrow->[$i-1])) { die "Internal Error: xrow [ $i-1 ] is undef"; }
    if (isNaN($xrow->[$i-1])) { return $this->{n}; }
    $xcopy[$i]= $xrow->[$i-1];  # note: copied 1-indexed for the update below
  }

  $this->{syy}+= ($weight*($yelement*$yelement));
  $this->{sy}+= ($weight*($yelement));
  if ($weight>=0.0) { ++$this->{n}; } else { --$this->{n}; }

  $this->{wghtn}+= $weight;

  # Gentleman's AS 75 update: rotate the new observation into the
  # triangular factor, one variable at a time
  for (my $i=1; $i<=$this->{k};++$i) {
    if ($weight==0.0) { return $this->{n}; }
    if (abs($xcopy[$i])>(TINY)) {
      my $xi=$xcopy[$i];

      my $di=$this->{d}->[$i];
      my $dprimei=$di+$weight*($xi*$xi);
      my $cbar= $di/$dprimei;
      my $sbar= $weight*$xi/$dprimei;
      $weight*=($cbar);
      $this->{d}->[$i]=$dprimei;
      my $nextr=int( (($i-1)*( (2.0*$this->{k}-$i))/2.0+1) );
      if (!($nextr<=$this->{rbarsize}) ) { die "Internal Error 2"; }
      my $xk;
      for (my $kc=$i+1;$kc<=$this->{k};++$kc) {
        $xk=$xcopy[$kc]; $xcopy[$kc]=$xk-$xi*$this->{rbar}->[$nextr];
        $this->{rbar}->[$nextr]= $cbar * $this->{rbar}->[$nextr]+$sbar*$xk;
        ++$nextr;
      }
      $xk=$yelement; $yelement-= $xi*$this->{thetabar}->[$i];
      $this->{thetabar}->[$i]= $cbar*$this->{thetabar}->[$i]+$sbar*$xk;
    }
  }
  $this->{sse}+=$weight*($yelement*$yelement);

  # indicate that theta and the goodness-of-fit statistics are stale now
  $this->{theta}= undef;
  $this->{sigmasq}= undef; $this->{rsq}= undef; $this->{adjrsq}= undef;

  return $this->{n};
}



################################################################

=pod

=head2 theta

estimates the coefficients and returns them as a reference to an
array indexed from 0 (or undef if there are not yet enough
observations).
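
For example (continuing the USAGE example above):

  my $theta = $reg->theta();      # reference to [ b0, b1, ..., b(k-1) ]
  print "intercept = $theta->[0]\n" if defined($theta);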

=cut
################################################################

sub theta {
  my $this= shift();

  if (defined($this->{theta})) { return $this->{theta}; }

  if ($this->{n} < $this->{k}) { return undef; }

  # backsubstitution: solve R*theta = thetabar from the bottom up
  for (my $i=($this->{k}); $i>=1; --$i) {
    $this->{theta}->[$i]= $this->{thetabar}->[$i];
    my $nextr= int (($i-1)*((2.0*$this->{k}-$i))/2.0+1);
    if (!($nextr<=$this->{rbarsize})) { die "Internal Error 3"; }
    for (my $kc=$i+1;$kc<=$this->{k};++$kc) {
      $this->{theta}->[$i]-=($this->{rbar}->[$nextr]*$this->{theta}->[$kc]);
      ++$nextr;
    }
  }

  my $ref= $this->{theta}; shift(@$ref); # shift off element 0 so we count from 0

  return $this->{theta}; # an array reference
}

################################################################
=pod

=head2 rsq, adjrsq, sigmasq, ybar, sst, k, n

These functions provide common auxiliary information.  rsq, adjrsq,
sigmasq, sst, and ybar have not been checked but are likely correct.
The results are stored for later use, although this is somewhat
unnecessary because the computation is so simple anyway.
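
For example (continuing the USAGE example; all of these assume that
observations have already been included):

  printf "R^2= %.3f, adj R^2= %.3f\n", $reg->rsq(), $reg->adjrsq();
  printf "sigma^2= %s, ybar= %.3f, N= %d\n",
         $reg->sigmasq(), $reg->ybar(), $reg->n();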

=cut

################################################################

sub rsq {
  my $this= shift();
  return $this->{rsq}= 1.0- $this->{sse} / $this->sst();
}

sub adjrsq {
  my $this= shift();
  return $this->{adjrsq}= 1.0- (1.0- $this->rsq())*($this->{n}-1)/($this->{n} - $this->{k});
}

sub sigmasq {
  my $this= shift();
  return $this->{sigmasq}= ($this->{n}<=$this->{k}) ? "Inf" : ($this->{sse}/($this->{n} - $this->{k}));
}

sub ybar {
  my $this= shift();
  return $this->{ybar}= $this->{sy}/$this->{wghtn};
}

sub sst {
  my $this= shift();
  return $this->{sst}= ($this->{syy} - $this->{wghtn}*($this->ybar())**2);
}

sub k {
  my $this= shift();
  return $this->{k};
}

sub n {
  my $this= shift();
  return $this->{n};
}


################################################################
=pod

=head1 DEBUGGING = SAMPLE USAGE CODE

The sample code included with this package demonstrates regression usage.
To execute it, just set the constant DEBUGGING at the script head to 1, and
do

  perl Regression.pm

The printout should be

  ****************************************************************
  Regression 'sample regression'
  ****************************************************************
  Theta[0='const']=       0.2950
  Theta[1='someX']=       0.6723
  Theta[2='someY']=       1.0688
  R^2= 0.808, N= 4
  ****************************************************************

=cut
################################################################

if (DEBUGGING) {
  package main;

  my $reg= Regression->new( 3, "sample regression", [ "const", "someX", "someY" ] );
  $reg->include( 2.0, [ 1.0, 3.0, -1.0 ] );
  $reg->include( 1.0, [ 1.0, 5.0, 2.0 ] );
  $reg->include( 20.0, [ 1.0, 31.0, 0.0 ] );
  $reg->include( 15.0, [ 1.0, 11.0, 2.0 ] );

  $reg->print();  # or: my $coefs= $reg->theta(); print "@$coefs"; print $reg->rsq();
}

################################################################
=pod

=head1 BUGS/PROBLEMS

=over 4

=item Missing

This package lacks routines to compute the standard errors of
the coefficients.  This requires access to a matrix inversion
package, and I do not have one at my disposal.  If you want to
add one, please let me know.

=item Perl Problem

Perl is unaware of IEEE number representations.  This makes it a
pain to test whether an observation contains any missing
variables (coded as 'NaN' in Regression.pm).

=item Others

I am a novice Perl programmer, so this is probably ugly code.  However, it
does seem to work, and I could not find anything equivalent on CPAN.

=back

=head1 INSTALLATION and DOCUMENTATION

Installation consists of moving the file 'Regression.pm' into a subdirectory
Statistics of your modules path (e.g., /usr/lib/perl5/site_perl/5.6.0/).

The documentation was produced from the module:

  pod2html -noindex -title "perl weighted least squares regression package" Regression.pm > Regression.html

The documentation was slightly modified by Maria Voloshina, University of Rochester.

=head1 LICENSE

This module is released for free public use under a GPL license.

(C) Ivo Welch, 2001.

=cut

################################################################
1;