Parent Directory
|
Revision Log
removed unneccesary shebang lines
1 2 3 package Regression; 4 5 $VERSION = 0.1; 6 7 use strict; 8 9 ################################################################ 10 use constant TINY => 1e-8; 11 use constant DEBUGGING => 0; 12 ################################################################ 13 =pod 14 15 =head1 NAME 16 17 Regression.pm - weighted linear regression package (line+plane fitting) 18 19 =head1 DESCRIPTION 20 21 Regression.pm is a multivariate linear regression package. That is, 22 it estimates the c coefficients for a line-fit of the type 23 24 y= b(0)*x(0) + b(1)*x1 + b(2)*x2 + ... + b(k)*xk 25 26 given a data set of N observations, each with k independent x variables and 27 one y variable. Naturally, N must be greater than k---and preferably 28 considerably greater. Any reasonable undergraduate statistics book will 29 explain what a regression is. Most of the time, the user will provide a 30 constant ('1') as x(0) for each observation in order to allow the regression 31 package to fit an intercept. 32 33 =head1 USAGE 34 35 If the sample data for (x1, x2, y) includes ($x1[$i], $x2[$i], $y[$i]) for 0<=$i<=5, type 36 37 $reg = Regression->new( 3, "y", [ "const", "x1", "x2" ] ); 38 39 for($i=0; $i<6; $i++){ 40 $reg->include( $y[$i], [ 1.0, $x1[$i], $x2[$i] ] ); 41 } 42 43 @coeff= $reg->theta(); 44 45 $b0 = $coeff[0][0]; 46 $b1 = $coeff[0][1]; 47 $b2 = $coeff[0][2]; 48 49 =head1 ALGORITHM 50 51 =head2 Original Algorithm (ALGOL-60): 52 53 W. M. Gentleman, University of Waterloo, "Basic Description 54 For Large, Sparse Or Weighted Linear Least Squares Problems 55 (Algorithm AS 75)," Applied Statistics (1974) Vol 23; No. 3 56 57 =head2 INTERNALS 58 59 R=Rbar is an upperright triangular matrix, kept in normalized 60 form with implicit 1's on the diagonal. D is a diagonal scaling 61 matrix. These correspond to "standard Regression usage" as 62 63 X' X = R' D R 64 65 A backsubsitution routine (in thetacov) allows to invert the R 66 matrix (the inverse is upper-right triangular, too!). Call this 67 matrix H, that is H=R^(-1). 68 69 (X' X)^(-1) = [(R' D^(1/2)') (D^(1/2) R)]^(-1) 70 = [ R^-1 D^(-1/2) ] [ R^-1 D^(-1/2) ]' 71 72 73 74 =head2 Remarks 75 76 This algorithm is the statistical "standard." Insertion of a new 77 observation can be done one obs at any time (WITH A WEIGHT!), 78 and still only takes a low quadratic time. The storage space 79 requirement is of quadratic order (in the indep variables). A 80 practically infinite number of observations can easily be 81 processed! 82 83 =head1 AUTHOR 84 85 Naturally, Gentleman invented this algorithm. Adaptation by ivo 86 welch. Alan Miller (alan@dmsmelb.mel.dms.CSIRO.AU) pointed out 87 nicer ways to compute the R^2. 88 89 =head1 Subroutines 90 91 =cut 92 ################################################################ 93 94 95 #### let's start with handling of missing data ("nan" or "NaN") 96 97 my $nan= "NaN"; 98 sub isNaN { 99 if ($_[0] !~ /[0-9nan]/) { die "definitely not a number in NaN: '$_[0]'"; } 100 return ($_[0]=~ /NaN/i) || ($_[0] != $_[0]); 101 } 102 103 104 ################################################################ 105 106 =pod 107 108 =head2 new 109 110 receives the number of variables on each observations (i.e., an integer) and 111 returns the blessed data structure. Also takes an optional name for this 112 regression to remember, as well as a reference to a k*1 array of names for 113 the X coefficients. 114 115 =cut 116 117 ################################################################ 118 sub new { 119 my $classname= shift(@_); 120 my $K= shift(@_); # the number of variables 121 my $regname= shift(@_) || "with no name"; 122 123 if (!defined($K)) { die "Regression->new needs at least one argument for the number of variables"; } 124 if ($K<=1) { die "Cannot run a regression without at least two variables."; } 125 126 sub zerovec { 127 my @rv; 128 for (my $i=0; $i<=$_[0]; ++$i) { $rv[$i]=0; } 129 return \@rv; 130 } 131 132 bless { 133 k => $K, 134 regname => $regname, 135 xnames => shift(@_), 136 137 # constantly updated 138 n => 0, 139 sse => 0, 140 syy => 0, 141 sy => 0, 142 wghtn => 0, 143 d => zerovec($K), 144 thetabar => zerovec($K), 145 rbarsize => ($K+1)*$K/2+1, 146 rbar => zerovec(($K+1)*$K/2+1), 147 148 # other constants 149 neverabort => 0, 150 151 # computed on demand 152 theta => undef, 153 sigmasq => undef, 154 rsq => undef, 155 adjrsq => undef 156 }, $classname; 157 } 158 159 ################################################################ 160 161 =pod 162 163 =head2 dump 164 165 is used for debugging. 166 167 =cut 168 169 ################################################################ 170 sub dump { 171 my $this= $_[0]; 172 print "****************************************************************\n"; 173 print "Regression '$this->{regname}'\n"; 174 print "****************************************************************\n"; 175 sub print1val { 176 no strict; 177 print "$_[1]($_[2])=\t". ((defined($_[0]->{ $_[2] }) ? $_[0]->{ $_[2] } : "intentionally undef")); 178 179 my $ref=$_[0]->{ $_[2] }; 180 181 if (ref($ref) eq 'ARRAY') { 182 my $arrayref= $ref; 183 print " $#$arrayref+1 elements:\n"; 184 if ($#$arrayref>30) { 185 print "\t"; 186 for(my $i=0; $i<$#$arrayref+1; ++$i) { print "$i='$arrayref->[$i]';"; } 187 print "\n"; 188 } 189 else { 190 for(my $i=0; $i<$#$arrayref+1; ++$i) { print "\t$i=\t'$arrayref->[$i]'\n"; } 191 } 192 } 193 elsif (ref($ref) eq 'HASH') { 194 my $hashref= $ref; 195 print " ".scalar(keys(%$hashref))." elements\n"; 196 while (my ($key, $val) = each(%$hashref)) { 197 print "\t'$key'=>'$val';\n"; 198 } 199 } 200 else { 201 print " [was scalar]\n"; } 202 } 203 204 while (my ($key, $val) = each(%$this)) { 205 $this->print1val($key, $key); 206 } 207 print "****************************************************************\n"; 208 } 209 210 ################################################################ 211 =pod 212 213 =head2 print 214 215 prints the estimated coefficients, and R^2 and N. 216 217 =cut 218 ################################################################ 219 sub print { 220 my $this= $_[0]; 221 print "****************************************************************\n"; 222 print "Regression '$this->{regname}'\n"; 223 print "****************************************************************\n"; 224 225 my $theta= $this->theta(); 226 227 for (my $i=0; $i< $this->k(); ++$i) { 228 print "Theta[$i".(defined($this->{xnames}->[$i]) ? "='$this->{xnames}->[$i]'":"")."]= ".sprintf("%12.4f", $theta->[$i])."\n"; 229 } 230 print "R^2= ".sprintf("%.3f", $this->rsq()).", N= ".$this->n()."\n"; 231 print "****************************************************************\n"; 232 } 233 234 235 ################################################################ 236 =pod 237 238 =head2 include 239 240 receives one new observation. Call is 241 242 $blessedregr->include( $yvariable, [ $x1, $x2, $x3 ... $xk ], 1.0 ); 243 244 where 1.0 is an (optional) weight. Note that inclusion with a 245 weight of -1 can be used to delete an observation. 246 247 The function returns the number of observations so far included. 248 249 =cut 250 ################################################################ 251 sub include { 252 my $this = shift(); 253 my $yelement= shift(); 254 my $xrow= shift(); 255 my $weight= shift() || 1.0; 256 257 # omit observations with missing observations; 258 if (!defined($yelement)) { die "Internal Error: yelement is undef"; } 259 if (isNaN($yelement)) { return $this->{n}; } 260 261 my @xcopy; 262 for (my $i=1; $i<=$this->{k}; ++$i) { 263 if (!defined($xrow->[$i-1])) { die "Internal Error: xrow [ $i-1 ] is undef"; } 264 if (isNaN($xrow->[$i-1])) { return $this->{n}; } 265 $xcopy[$i]= $xrow->[$i-1]; 266 } 267 268 $this->{syy}+= ($weight*($yelement*$yelement)); 269 $this->{sy}+= ($weight*($yelement)); 270 if ($weight>=0.0) { ++$this->{n}; } else { --$this->{n}; } 271 272 $this->{wghtn}+= $weight; 273 274 for (my $i=1; $i<=$this->{k};++$i) { 275 if ($weight==0.0) { return $this->{n}; } 276 if (abs($xcopy[$i])>(TINY)) { 277 my $xi=$xcopy[$i]; 278 279 my $di=$this->{d}->[$i]; 280 my $dprimei=$di+$weight*($xi*$xi); 281 my $cbar= $di/$dprimei; 282 my $sbar= $weight*$xi/$dprimei; 283 $weight*=($cbar); 284 $this->{d}->[$i]=$dprimei; 285 my $nextr=int( (($i-1)*( (2.0*$this->{k}-$i))/2.0+1) ); 286 if (!($nextr<=$this->{rbarsize}) ) { die "Internal Error 2"; } 287 my $xk; 288 for (my $kc=$i+1;$kc<=$this->{k};++$kc) { 289 $xk=$xcopy[$kc]; $xcopy[$kc]=$xk-$xi*$this->{rbar}->[$nextr]; 290 $this->{rbar}->[$nextr]= $cbar * $this->{rbar}->[$nextr]+$sbar*$xk; 291 ++$nextr; 292 } 293 $xk=$yelement; $yelement-= $xi*$this->{thetabar}->[$i]; 294 $this->{thetabar}->[$i]= $cbar*$this->{thetabar}->[$i]+$sbar*$xk; 295 } 296 } 297 $this->{sse}+=$weight*($yelement*$yelement); 298 299 # indicate that Theta is garbage now 300 $this->{theta}= undef; 301 $this->{sigmasq}= undef; $this->{rsq}= undef; $this->{adjrsq}= undef; 302 303 return $this->{n}; 304 } 305 306 307 308 ################################################################ 309 310 =pod 311 312 =head2 theta 313 314 estimates and returns the vector of coefficients. 315 316 =cut 317 ################################################################ 318 319 sub theta { 320 my $this= shift(); 321 322 if (defined($this->{theta})) { return $this->{theta}; } 323 324 if ($this->{n} < $this->{k}) { return undef; } 325 for (my $i=($this->{k}); $i>=1; --$i) { 326 $this->{theta}->[$i]= $this->{thetabar}->[$i]; 327 my $nextr= int (($i-1)*((2.0*$this->{k}-$i))/2.0+1); 328 if (!($nextr<=$this->{rbarsize})) { die "Internal Error 3"; } 329 for (my $kc=$i+1;$kc<=$this->{k};++$kc) { 330 $this->{theta}->[$i]-=($this->{rbar}->[$nextr]*$this->{theta}->[$kc]); 331 ++$nextr; 332 } 333 } 334 335 my $ref= $this->{theta}; shift(@$ref); # we are counting from 0 336 337 # if in a scalar context, otherwise please return the array directly 338 return $this->{theta}; 339 } 340 341 ################################################################ 342 =pod 343 344 =head2 rsq, adjrsq, sigmasq, ybar, sst, k, n 345 346 These functions provide common auxiliary information. rsq, adjrsq, 347 sigmasq, sst, and ybar have not been checked but are likely correct. 348 The results are stored for later usage, although this is somewhat 349 unnecessary because the computation is so simple anyway. 350 351 =cut 352 353 ################################################################ 354 355 sub rsq { 356 my $this= shift(); 357 return $this->{rsq}= 1.0- $this->{sse} / $this->sst(); 358 } 359 360 sub adjrsq { 361 my $this= shift(); 362 return $this->{adjrsq}= 1.0- (1.0- $this->rsq())*($this->{n}-1)/($this->{n} - $this->{k}); 363 } 364 365 sub sigmasq { 366 my $this= shift(); 367 return $this->{sigmasq}= ($this->{n}<=$this->{k}) ? "Inf" : ($this->{sse}/($this->{n} - $this->{k})); 368 } 369 370 sub ybar { 371 my $this= shift(); 372 return $this->{ybar}= $this->{sy}/$this->{wghtn}; 373 } 374 375 sub sst { 376 my $this= shift(); 377 return $this->{sst}= ($this->{syy} - $this->{wghtn}*($this->ybar())**2); 378 } 379 380 sub k { 381 my $this= shift(); 382 return $this->{k}; 383 } 384 sub n { 385 my $this= shift(); 386 return $this->{n}; 387 } 388 389 390 ################################################################ 391 =pod 392 393 =head1 DEBUGGING = SAMPLE USAGE CODE 394 395 The sample code included with this package demonstrates regression usage. 396 To execute it, just set the constant DEBUGGING at the script head to 1, and 397 do 398 399 perl Regression.pm 400 401 The printout should be 402 403 **************************************************************** 404 Regression 'sample regression' 405 **************************************************************** 406 Theta[0='const']= 0.2950 407 Theta[1='someX']= 0.6723 408 Theta[2='someY']= 1.0688 409 R^2= 0.808, N= 4 410 **************************************************************** 411 412 =cut 413 ################################################################ 414 415 if (DEBUGGING) { 416 package main; 417 418 my $reg= Statistics::Regression->new( 3, "sample regression", [ "const", "someX", "someY" ] ); 419 $reg->include( 2.0, [ 1.0, 3.0, -1.0 ] ); 420 $reg->include( 1.0, [ 1.0, 5.0, 2.0 ] ); 421 $reg->include( 20.0, [ 1.0, 31.0, 0.0 ] ); 422 $reg->include( 15.0, [ 1.0, 11.0, 2.0 ] ); 423 424 # $reg->print(); or: my $coefs= $reg->theta(); print @coefs; print $reg->rsq; 425 # my $coefs= $reg->theta(); print $coeff[0]; 426 } 427 428 ################################################################ 429 =pod 430 431 =head1 BUGS/PROBLEMS 432 433 =over 4 434 435 =item Missing 436 437 This package lacks routines to compute the standard errors of 438 the coefficients. This requires access to a matrix inversion 439 package, and I do not have one at my disposal. If you want to 440 add one, please let me know. 441 442 =item Perl Problem 443 444 perl is unaware of IEEE number representations. This makes it a 445 pain to test whether an observation contains any missing 446 variables (coded as 'NaN' in Regression.pm). 447 448 =item Others 449 450 I am a novice perl programmer, so this is probably ugly code. However, it 451 does seem to work, and I could not find anything equivalent on cpan. 452 453 =back 454 455 =head1 INSTALLATION and DOCUMENTATION 456 457 Installation consists of moving the file 'Regression.pm' into a subdirectory 458 Statistics of your modules path (e.g., /usr/lib/perl5/site_perl/5.6.0/). 459 460 The documentation was produced from the module: 461 462 pod2html -noindex -title "perl weighted least squares regression package" Regression.pm > Regression.html 463 464 The documentation was slightly modified by Maria Voloshina, University of Rochester. 465 466 =head1 LICENSE 467 468 This module is released for free public use under a GPL license. 469 470 (C) Ivo Welch, 2001. 471 472 =cut 473 474 ################################################################ 475 1;
| aubreyja at gmail dot com | ViewVC Help |
| Powered by ViewVC 1.0.9 |