Parent Directory
|
Revision Log
Revision 79 - (view) (download) (as text)
| 1 : | maria | 79 | #!/usr/bin/perl -w |
| 2 : | |||
| 3 : | package Regression; | ||
| 4 : | |||
| 5 : | $VERSION = 0.1; | ||
| 6 : | |||
| 7 : | use strict; | ||
| 8 : | |||
| 9 : | ################################################################ | ||
| 10 : | use constant TINY => 1e-8; | ||
| 11 : | use constant DEBUGGING => 0; | ||
| 12 : | ################################################################ | ||
| 13 : | =pod | ||
| 14 : | |||
| 15 : | =head1 NAME | ||
| 16 : | |||
| 17 : | Regression.pm - weighted linear regression package (line+plane fitting) | ||
| 18 : | |||
| 19 : | =head1 DESCRIPTION | ||
| 20 : | |||
| 21 : | Regression.pm is a multivariate linear regression package. That is, | ||
| 22 : | it estimates the c coefficients for a line-fit of the type | ||
| 23 : | |||
| 24 : | y= b(0)*x(0) + b(1)*x1 + b(2)*x2 + ... + b(k)*xk | ||
| 25 : | |||
| 26 : | given a data set of N observations, each with k independent x variables and | ||
| 27 : | one y variable. Naturally, N must be greater than k---and preferably | ||
| 28 : | considerably greater. Any reasonable undergraduate statistics book will | ||
| 29 : | explain what a regression is. Most of the time, the user will provide a | ||
| 30 : | constant ('1') as x(0) for each observation in order to allow the regression | ||
| 31 : | package to fit an intercept. | ||
| 32 : | |||
| 33 : | =head1 USAGE | ||
| 34 : | |||
| 35 : | If the sample data for (x1, x2, y) includes ($x1[$i], $x2[$i], $y[$i]) for 0<=$i<=5, type | ||
| 36 : | |||
| 37 : | $reg = Regression->new( 3, "y", [ "const", "x1", "x2" ] ); | ||
| 38 : | |||
| 39 : | for($i=0; $i<6; $i++){ | ||
| 40 : | $reg->include( $y[$i], [ 1.0, $x1[$i], $x2[$i] ] ); | ||
| 41 : | } | ||
| 42 : | |||
| 43 : | @coeff= $reg->theta(); | ||
| 44 : | |||
| 45 : | $b0 = $coeff[0][0]; | ||
| 46 : | $b1 = $coeff[0][1]; | ||
| 47 : | $b2 = $coeff[0][2]; | ||
| 48 : | |||
| 49 : | =head1 ALGORITHM | ||
| 50 : | |||
| 51 : | =head2 Original Algorithm (ALGOL-60): | ||
| 52 : | |||
| 53 : | W. M. Gentleman, University of Waterloo, "Basic Description | ||
| 54 : | For Large, Sparse Or Weighted Linear Least Squares Problems | ||
| 55 : | (Algorithm AS 75)," Applied Statistics (1974) Vol 23; No. 3 | ||
| 56 : | |||
| 57 : | =head2 INTERNALS | ||
| 58 : | |||
| 59 : | R=Rbar is an upperright triangular matrix, kept in normalized | ||
| 60 : | form with implicit 1's on the diagonal. D is a diagonal scaling | ||
| 61 : | matrix. These correspond to "standard Regression usage" as | ||
| 62 : | |||
| 63 : | X' X = R' D R | ||
| 64 : | |||
| 65 : | A backsubsitution routine (in thetacov) allows to invert the R | ||
| 66 : | matrix (the inverse is upper-right triangular, too!). Call this | ||
| 67 : | matrix H, that is H=R^(-1). | ||
| 68 : | |||
| 69 : | (X' X)^(-1) = [(R' D^(1/2)') (D^(1/2) R)]^(-1) | ||
| 70 : | = [ R^-1 D^(-1/2) ] [ R^-1 D^(-1/2) ]' | ||
| 71 : | |||
| 72 : | |||
| 73 : | |||
| 74 : | =head2 Remarks | ||
| 75 : | |||
| 76 : | This algorithm is the statistical "standard." Insertion of a new | ||
| 77 : | observation can be done one obs at any time (WITH A WEIGHT!), | ||
| 78 : | and still only takes a low quadratic time. The storage space | ||
| 79 : | requirement is of quadratic order (in the indep variables). A | ||
| 80 : | practically infinite number of observations can easily be | ||
| 81 : | processed! | ||
| 82 : | |||
| 83 : | =head1 AUTHOR | ||
| 84 : | |||
| 85 : | Naturally, Gentleman invented this algorithm. Adaptation by ivo | ||
| 86 : | welch. Alan Miller (alan@dmsmelb.mel.dms.CSIRO.AU) pointed out | ||
| 87 : | nicer ways to compute the R^2. | ||
| 88 : | |||
| 89 : | =head1 Subroutines | ||
| 90 : | |||
| 91 : | =cut | ||
| 92 : | ################################################################ | ||
| 93 : | |||
| 94 : | |||
| 95 : | #### let's start with handling of missing data ("nan" or "NaN") | ||
| 96 : | |||
| 97 : | my $nan= "NaN"; | ||
| 98 : | sub isNaN { | ||
| 99 : | if ($_[0] !~ /[0-9nan]/) { die "definitely not a number in NaN: '$_[0]'"; } | ||
| 100 : | return ($_[0]=~ /NaN/i) || ($_[0] != $_[0]); | ||
| 101 : | } | ||
| 102 : | |||
| 103 : | |||
| 104 : | ################################################################ | ||
| 105 : | |||
| 106 : | =pod | ||
| 107 : | |||
| 108 : | =head2 new | ||
| 109 : | |||
| 110 : | receives the number of variables on each observations (i.e., an integer) and | ||
| 111 : | returns the blessed data structure. Also takes an optional name for this | ||
| 112 : | regression to remember, as well as a reference to a k*1 array of names for | ||
| 113 : | the X coefficients. | ||
| 114 : | |||
| 115 : | =cut | ||
| 116 : | |||
| 117 : | ################################################################ | ||
| 118 : | sub new { | ||
| 119 : | my $classname= shift(@_); | ||
| 120 : | my $K= shift(@_); # the number of variables | ||
| 121 : | my $regname= shift(@_) || "with no name"; | ||
| 122 : | |||
| 123 : | if (!defined($K)) { die "Regression->new needs at least one argument for the number of variables"; } | ||
| 124 : | if ($K<=1) { die "Cannot run a regression without at least two variables."; } | ||
| 125 : | |||
| 126 : | sub zerovec { | ||
| 127 : | my @rv; | ||
| 128 : | for (my $i=0; $i<=$_[0]; ++$i) { $rv[$i]=0; } | ||
| 129 : | return \@rv; | ||
| 130 : | } | ||
| 131 : | |||
| 132 : | bless { | ||
| 133 : | k => $K, | ||
| 134 : | regname => $regname, | ||
| 135 : | xnames => shift(@_), | ||
| 136 : | |||
| 137 : | # constantly updated | ||
| 138 : | n => 0, | ||
| 139 : | sse => 0, | ||
| 140 : | syy => 0, | ||
| 141 : | sy => 0, | ||
| 142 : | wghtn => 0, | ||
| 143 : | d => zerovec($K), | ||
| 144 : | thetabar => zerovec($K), | ||
| 145 : | rbarsize => ($K+1)*$K/2+1, | ||
| 146 : | rbar => zerovec(($K+1)*$K/2+1), | ||
| 147 : | |||
| 148 : | # other constants | ||
| 149 : | neverabort => 0, | ||
| 150 : | |||
| 151 : | # computed on demand | ||
| 152 : | theta => undef, | ||
| 153 : | sigmasq => undef, | ||
| 154 : | rsq => undef, | ||
| 155 : | adjrsq => undef | ||
| 156 : | }, $classname; | ||
| 157 : | } | ||
| 158 : | |||
| 159 : | ################################################################ | ||
| 160 : | |||
| 161 : | =pod | ||
| 162 : | |||
| 163 : | =head2 dump | ||
| 164 : | |||
| 165 : | is used for debugging. | ||
| 166 : | |||
| 167 : | =cut | ||
| 168 : | |||
| 169 : | ################################################################ | ||
| 170 : | sub dump { | ||
| 171 : | my $this= $_[0]; | ||
| 172 : | print "****************************************************************\n"; | ||
| 173 : | print "Regression '$this->{regname}'\n"; | ||
| 174 : | print "****************************************************************\n"; | ||
| 175 : | sub print1val { | ||
| 176 : | no strict; | ||
| 177 : | print "$_[1]($_[2])=\t". ((defined($_[0]->{ $_[2] }) ? $_[0]->{ $_[2] } : "intentionally undef")); | ||
| 178 : | |||
| 179 : | my $ref=$_[0]->{ $_[2] }; | ||
| 180 : | |||
| 181 : | if (ref($ref) eq 'ARRAY') { | ||
| 182 : | my $arrayref= $ref; | ||
| 183 : | print " $#$arrayref+1 elements:\n"; | ||
| 184 : | if ($#$arrayref>30) { | ||
| 185 : | print "\t"; | ||
| 186 : | for(my $i=0; $i<$#$arrayref+1; ++$i) { print "$i='$arrayref->[$i]';"; } | ||
| 187 : | print "\n"; | ||
| 188 : | } | ||
| 189 : | else { | ||
| 190 : | for(my $i=0; $i<$#$arrayref+1; ++$i) { print "\t$i=\t'$arrayref->[$i]'\n"; } | ||
| 191 : | } | ||
| 192 : | } | ||
| 193 : | elsif (ref($ref) eq 'HASH') { | ||
| 194 : | my $hashref= $ref; | ||
| 195 : | print " ".scalar(keys(%$hashref))." elements\n"; | ||
| 196 : | while (my ($key, $val) = each(%$hashref)) { | ||
| 197 : | print "\t'$key'=>'$val';\n"; | ||
| 198 : | } | ||
| 199 : | } | ||
| 200 : | else { | ||
| 201 : | print " [was scalar]\n"; } | ||
| 202 : | } | ||
| 203 : | |||
| 204 : | while (my ($key, $val) = each(%$this)) { | ||
| 205 : | $this->print1val($key, $key); | ||
| 206 : | } | ||
| 207 : | print "****************************************************************\n"; | ||
| 208 : | } | ||
| 209 : | |||
| 210 : | ################################################################ | ||
| 211 : | =pod | ||
| 212 : | |||
| 213 : | =head2 print | ||
| 214 : | |||
| 215 : | prints the estimated coefficients, and R^2 and N. | ||
| 216 : | |||
| 217 : | =cut | ||
| 218 : | ################################################################ | ||
| 219 : | sub print { | ||
| 220 : | my $this= $_[0]; | ||
| 221 : | print "****************************************************************\n"; | ||
| 222 : | print "Regression '$this->{regname}'\n"; | ||
| 223 : | print "****************************************************************\n"; | ||
| 224 : | |||
| 225 : | my $theta= $this->theta(); | ||
| 226 : | |||
| 227 : | for (my $i=0; $i< $this->k(); ++$i) { | ||
| 228 : | print "Theta[$i".(defined($this->{xnames}->[$i]) ? "='$this->{xnames}->[$i]'":"")."]= ".sprintf("%12.4f", $theta->[$i])."\n"; | ||
| 229 : | } | ||
| 230 : | print "R^2= ".sprintf("%.3f", $this->rsq()).", N= ".$this->n()."\n"; | ||
| 231 : | print "****************************************************************\n"; | ||
| 232 : | } | ||
| 233 : | |||
| 234 : | |||
| 235 : | ################################################################ | ||
| 236 : | =pod | ||
| 237 : | |||
| 238 : | =head2 include | ||
| 239 : | |||
| 240 : | receives one new observation. Call is | ||
| 241 : | |||
| 242 : | $blessedregr->include( $yvariable, [ $x1, $x2, $x3 ... $xk ], 1.0 ); | ||
| 243 : | |||
| 244 : | where 1.0 is an (optional) weight. Note that inclusion with a | ||
| 245 : | weight of -1 can be used to delete an observation. | ||
| 246 : | |||
| 247 : | The function returns the number of observations so far included. | ||
| 248 : | |||
| 249 : | =cut | ||
| 250 : | ################################################################ | ||
| 251 : | sub include { | ||
| 252 : | my $this = shift(); | ||
| 253 : | my $yelement= shift(); | ||
| 254 : | my $xrow= shift(); | ||
| 255 : | my $weight= shift() || 1.0; | ||
| 256 : | |||
| 257 : | # omit observations with missing observations; | ||
| 258 : | if (!defined($yelement)) { die "Internal Error: yelement is undef"; } | ||
| 259 : | if (isNaN($yelement)) { return $this->{n}; } | ||
| 260 : | |||
| 261 : | my @xcopy; | ||
| 262 : | for (my $i=1; $i<=$this->{k}; ++$i) { | ||
| 263 : | if (!defined($xrow->[$i-1])) { die "Internal Error: xrow [ $i-1 ] is undef"; } | ||
| 264 : | if (isNaN($xrow->[$i-1])) { return $this->{n}; } | ||
| 265 : | $xcopy[$i]= $xrow->[$i-1]; | ||
| 266 : | } | ||
| 267 : | |||
| 268 : | $this->{syy}+= ($weight*($yelement*$yelement)); | ||
| 269 : | $this->{sy}+= ($weight*($yelement)); | ||
| 270 : | if ($weight>=0.0) { ++$this->{n}; } else { --$this->{n}; } | ||
| 271 : | |||
| 272 : | $this->{wghtn}+= $weight; | ||
| 273 : | |||
| 274 : | for (my $i=1; $i<=$this->{k};++$i) { | ||
| 275 : | if ($weight==0.0) { return $this->{n}; } | ||
| 276 : | if (abs($xcopy[$i])>(TINY)) { | ||
| 277 : | my $xi=$xcopy[$i]; | ||
| 278 : | |||
| 279 : | my $di=$this->{d}->[$i]; | ||
| 280 : | my $dprimei=$di+$weight*($xi*$xi); | ||
| 281 : | my $cbar= $di/$dprimei; | ||
| 282 : | my $sbar= $weight*$xi/$dprimei; | ||
| 283 : | $weight*=($cbar); | ||
| 284 : | $this->{d}->[$i]=$dprimei; | ||
| 285 : | my $nextr=int( (($i-1)*( (2.0*$this->{k}-$i))/2.0+1) ); | ||
| 286 : | if (!($nextr<=$this->{rbarsize}) ) { die "Internal Error 2"; } | ||
| 287 : | my $xk; | ||
| 288 : | for (my $kc=$i+1;$kc<=$this->{k};++$kc) { | ||
| 289 : | $xk=$xcopy[$kc]; $xcopy[$kc]=$xk-$xi*$this->{rbar}->[$nextr]; | ||
| 290 : | $this->{rbar}->[$nextr]= $cbar * $this->{rbar}->[$nextr]+$sbar*$xk; | ||
| 291 : | ++$nextr; | ||
| 292 : | } | ||
| 293 : | $xk=$yelement; $yelement-= $xi*$this->{thetabar}->[$i]; | ||
| 294 : | $this->{thetabar}->[$i]= $cbar*$this->{thetabar}->[$i]+$sbar*$xk; | ||
| 295 : | } | ||
| 296 : | } | ||
| 297 : | $this->{sse}+=$weight*($yelement*$yelement); | ||
| 298 : | |||
| 299 : | # indicate that Theta is garbage now | ||
| 300 : | $this->{theta}= undef; | ||
| 301 : | $this->{sigmasq}= undef; $this->{rsq}= undef; $this->{adjrsq}= undef; | ||
| 302 : | |||
| 303 : | return $this->{n}; | ||
| 304 : | } | ||
| 305 : | |||
| 306 : | |||
| 307 : | |||
| 308 : | ################################################################ | ||
| 309 : | |||
| 310 : | =pod | ||
| 311 : | |||
| 312 : | =head2 theta | ||
| 313 : | |||
| 314 : | estimates and returns the vector of coefficients. | ||
| 315 : | |||
| 316 : | =cut | ||
| 317 : | ################################################################ | ||
| 318 : | |||
| 319 : | sub theta { | ||
| 320 : | my $this= shift(); | ||
| 321 : | |||
| 322 : | if (defined($this->{theta})) { return $this->{theta}; } | ||
| 323 : | |||
| 324 : | if ($this->{n} < $this->{k}) { return undef; } | ||
| 325 : | for (my $i=($this->{k}); $i>=1; --$i) { | ||
| 326 : | $this->{theta}->[$i]= $this->{thetabar}->[$i]; | ||
| 327 : | my $nextr= int (($i-1)*((2.0*$this->{k}-$i))/2.0+1); | ||
| 328 : | if (!($nextr<=$this->{rbarsize})) { die "Internal Error 3"; } | ||
| 329 : | for (my $kc=$i+1;$kc<=$this->{k};++$kc) { | ||
| 330 : | $this->{theta}->[$i]-=($this->{rbar}->[$nextr]*$this->{theta}->[$kc]); | ||
| 331 : | ++$nextr; | ||
| 332 : | } | ||
| 333 : | } | ||
| 334 : | |||
| 335 : | my $ref= $this->{theta}; shift(@$ref); # we are counting from 0 | ||
| 336 : | |||
| 337 : | # if in a scalar context, otherwise please return the array directly | ||
| 338 : | return $this->{theta}; | ||
| 339 : | } | ||
| 340 : | |||
| 341 : | ################################################################ | ||
| 342 : | =pod | ||
| 343 : | |||
| 344 : | =head2 rsq, adjrsq, sigmasq, ybar, sst, k, n | ||
| 345 : | |||
| 346 : | These functions provide common auxiliary information. rsq, adjrsq, | ||
| 347 : | sigmasq, sst, and ybar have not been checked but are likely correct. | ||
| 348 : | The results are stored for later usage, although this is somewhat | ||
| 349 : | unnecessary because the computation is so simple anyway. | ||
| 350 : | |||
| 351 : | =cut | ||
| 352 : | |||
| 353 : | ################################################################ | ||
| 354 : | |||
| 355 : | sub rsq { | ||
| 356 : | my $this= shift(); | ||
| 357 : | return $this->{rsq}= 1.0- $this->{sse} / $this->sst(); | ||
| 358 : | } | ||
| 359 : | |||
| 360 : | sub adjrsq { | ||
| 361 : | my $this= shift(); | ||
| 362 : | return $this->{adjrsq}= 1.0- (1.0- $this->rsq())*($this->{n}-1)/($this->{n} - $this->{k}); | ||
| 363 : | } | ||
| 364 : | |||
| 365 : | sub sigmasq { | ||
| 366 : | my $this= shift(); | ||
| 367 : | return $this->{sigmasq}= ($this->{n}<=$this->{k}) ? "Inf" : ($this->{sse}/($this->{n} - $this->{k})); | ||
| 368 : | } | ||
| 369 : | |||
| 370 : | sub ybar { | ||
| 371 : | my $this= shift(); | ||
| 372 : | return $this->{ybar}= $this->{sy}/$this->{wghtn}; | ||
| 373 : | } | ||
| 374 : | |||
| 375 : | sub sst { | ||
| 376 : | my $this= shift(); | ||
| 377 : | return $this->{sst}= ($this->{syy} - $this->{wghtn}*($this->ybar())**2); | ||
| 378 : | } | ||
| 379 : | |||
| 380 : | sub k { | ||
| 381 : | my $this= shift(); | ||
| 382 : | return $this->{k}; | ||
| 383 : | } | ||
| 384 : | sub n { | ||
| 385 : | my $this= shift(); | ||
| 386 : | return $this->{n}; | ||
| 387 : | } | ||
| 388 : | |||
| 389 : | |||
| 390 : | ################################################################ | ||
| 391 : | =pod | ||
| 392 : | |||
| 393 : | =head1 DEBUGGING = SAMPLE USAGE CODE | ||
| 394 : | |||
| 395 : | The sample code included with this package demonstrates regression usage. | ||
| 396 : | To execute it, just set the constant DEBUGGING at the script head to 1, and | ||
| 397 : | do | ||
| 398 : | |||
| 399 : | perl Regression.pm | ||
| 400 : | |||
| 401 : | The printout should be | ||
| 402 : | |||
| 403 : | **************************************************************** | ||
| 404 : | Regression 'sample regression' | ||
| 405 : | **************************************************************** | ||
| 406 : | Theta[0='const']= 0.2950 | ||
| 407 : | Theta[1='someX']= 0.6723 | ||
| 408 : | Theta[2='someY']= 1.0688 | ||
| 409 : | R^2= 0.808, N= 4 | ||
| 410 : | **************************************************************** | ||
| 411 : | |||
| 412 : | =cut | ||
| 413 : | ################################################################ | ||
| 414 : | |||
| 415 : | if (DEBUGGING) { | ||
| 416 : | package main; | ||
| 417 : | |||
| 418 : | my $reg= Statistics::Regression->new( 3, "sample regression", [ "const", "someX", "someY" ] ); | ||
| 419 : | $reg->include( 2.0, [ 1.0, 3.0, -1.0 ] ); | ||
| 420 : | $reg->include( 1.0, [ 1.0, 5.0, 2.0 ] ); | ||
| 421 : | $reg->include( 20.0, [ 1.0, 31.0, 0.0 ] ); | ||
| 422 : | $reg->include( 15.0, [ 1.0, 11.0, 2.0 ] ); | ||
| 423 : | |||
| 424 : | # $reg->print(); or: my $coefs= $reg->theta(); print @coefs; print $reg->rsq; | ||
| 425 : | # my $coefs= $reg->theta(); print $coeff[0]; | ||
| 426 : | } | ||
| 427 : | |||
| 428 : | ################################################################ | ||
| 429 : | =pod | ||
| 430 : | |||
| 431 : | =head1 BUGS/PROBLEMS | ||
| 432 : | |||
| 433 : | =over 4 | ||
| 434 : | |||
| 435 : | =item Missing | ||
| 436 : | |||
| 437 : | This package lacks routines to compute the standard errors of | ||
| 438 : | the coefficients. This requires access to a matrix inversion | ||
| 439 : | package, and I do not have one at my disposal. If you want to | ||
| 440 : | add one, please let me know. | ||
| 441 : | |||
| 442 : | =item Perl Problem | ||
| 443 : | |||
| 444 : | perl is unaware of IEEE number representations. This makes it a | ||
| 445 : | pain to test whether an observation contains any missing | ||
| 446 : | variables (coded as 'NaN' in Regression.pm). | ||
| 447 : | |||
| 448 : | =item Others | ||
| 449 : | |||
| 450 : | I am a novice perl programmer, so this is probably ugly code. However, it | ||
| 451 : | does seem to work, and I could not find anything equivalent on cpan. | ||
| 452 : | |||
| 453 : | =back | ||
| 454 : | |||
| 455 : | =head1 INSTALLATION and DOCUMENTATION | ||
| 456 : | |||
| 457 : | Installation consists of moving the file 'Regression.pm' into a subdirectory | ||
| 458 : | Statistics of your modules path (e.g., /usr/lib/perl5/site_perl/5.6.0/). | ||
| 459 : | |||
| 460 : | The documentation was produced from the module: | ||
| 461 : | |||
| 462 : | pod2html -noindex -title "perl weighted least squares regression package" Regression.pm > Regression.html | ||
| 463 : | |||
| 464 : | The documentation was slightly modified by Maria Voloshina, University of Rochester. | ||
| 465 : | |||
| 466 : | =head1 LICENSE | ||
| 467 : | |||
| 468 : | This module is released for free public use under a GPL license. | ||
| 469 : | |||
| 470 : | (C) Ivo Welch, 2001. | ||
| 471 : | |||
| 472 : | =cut | ||
| 473 : | |||
| 474 : | ################################################################ | ||
| 475 : | 1; |
| aubreyja at gmail dot com | ViewVC Help |
| Powered by ViewVC 1.0.9 |