1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.math.stat.regression;
19 import java.io.Serializable;
20
21 import org.apache.commons.math.MathException;
22 import org.apache.commons.math.distribution.TDistribution;
23 import org.apache.commons.math.distribution.TDistributionImpl;
24
25 /**
26 * Estimates an ordinary least squares regression model
27 * with one independent variable.
28 * <p>
29 * <code> y = intercept + slope * x </code></p>
30 * <p>
31 * Standard errors for <code>intercept</code> and <code>slope</code> are
32 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
33 * <p>
34 * Observations (x,y pairs) can be added to the model one at a time or they
35 * can be provided in a 2-dimensional array. The observations are not stored
36 * in memory, so there is no limit to the number of observations that can be
37 * added to the model.</p>
38 * <p>
39 * <strong>Usage Notes</strong>: <ul>
40 * <li> When there are fewer than two observations in the model, or when
41 * there is no variation in the x values (i.e. all x values are the same)
42 * all statistics return <code>NaN</code>. At least two observations with
43 * different x coordinates are requred to estimate a bivariate regression
44 * model.
45 * </li>
46 * <li> getters for the statistics always compute values based on the current
47 * set of observations -- i.e., you can get statistics, then add more data
48 * and get updated statistics without using a new instance. There is no
49 * "compute" method that updates all statistics. Each of the getters performs
50 * the necessary computations to return the requested statistic.</li>
51 * </ul></p>
52 *
53 * @version $Revision: 617953 $ $Date: 2008-02-02 22:54:00 -0700 (Sat, 02 Feb 2008) $
54 */
55 public class SimpleRegression implements Serializable {
56
57 /** Serializable version identifier */
58 private static final long serialVersionUID = -3004689053607543335L;
59
60 /** the distribution used to compute inference statistics. */
61 private TDistribution distribution;
62
63 /** sum of x values */
64 private double sumX = 0d;
65
66 /** total variation in x (sum of squared deviations from xbar) */
67 private double sumXX = 0d;
68
69 /** sum of y values */
70 private double sumY = 0d;
71
72 /** total variation in y (sum of squared deviations from ybar) */
73 private double sumYY = 0d;
74
75 /** sum of products */
76 private double sumXY = 0d;
77
78 /** number of observations */
79 private long n = 0;
80
81 /** mean of accumulated x values, used in updating formulas */
82 private double xbar = 0;
83
84 /** mean of accumulated y values, used in updating formulas */
85 private double ybar = 0;
86
87 // ---------------------Public methods--------------------------------------
88
89 /**
90 * Create an empty SimpleRegression instance
91 */
92 public SimpleRegression() {
93 this(new TDistributionImpl(1.0));
94 }
95
96 /**
97 * Create an empty SimpleRegression using the given distribution object to
98 * compute inference statistics.
99 * @param t the distribution used to compute inference statistics.
100 * @since 1.2
101 */
102 public SimpleRegression(TDistribution t) {
103 super();
104 setDistribution(t);
105 }
106
107 /**
108 * Adds the observation (x,y) to the regression data set.
109 * <p>
110 * Uses updating formulas for means and sums of squares defined in
111 * "Algorithms for Computing the Sample Variance: Analysis and
112 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
113 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
114 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
115 *
116 *
117 * @param x independent variable value
118 * @param y dependent variable value
119 */
120 public void addData(double x, double y) {
121 if (n == 0) {
122 xbar = x;
123 ybar = y;
124 } else {
125 double dx = x - xbar;
126 double dy = y - ybar;
127 sumXX += dx * dx * (double) n / (double) (n + 1.0);
128 sumYY += dy * dy * (double) n / (double) (n + 1.0);
129 sumXY += dx * dy * (double) n / (double) (n + 1.0);
130 xbar += dx / (double) (n + 1.0);
131 ybar += dy / (double) (n + 1.0);
132 }
133 sumX += x;
134 sumY += y;
135 n++;
136
137 if (n > 2) {
138 distribution.setDegreesOfFreedom(n - 2);
139 }
140 }
141
142 /**
143 * Adds the observations represented by the elements in
144 * <code>data</code>.
145 * <p>
146 * <code>(data[0][0],data[0][1])</code> will be the first observation, then
147 * <code>(data[1][0],data[1][1])</code>, etc.</p>
148 * <p>
149 * This method does not replace data that has already been added. The
150 * observations represented by <code>data</code> are added to the existing
151 * dataset.</p>
152 * <p>
153 * To replace all data, use <code>clear()</code> before adding the new
154 * data.</p>
155 *
156 * @param data array of observations to be added
157 */
158 public void addData(double[][] data) {
159 for (int i = 0; i < data.length; i++) {
160 addData(data[i][0], data[i][1]);
161 }
162 }
163
164 /**
165 * Clears all data from the model.
166 */
167 public void clear() {
168 sumX = 0d;
169 sumXX = 0d;
170 sumY = 0d;
171 sumYY = 0d;
172 sumXY = 0d;
173 n = 0;
174 }
175
176 /**
177 * Returns the number of observations that have been added to the model.
178 *
179 * @return n number of observations that have been added.
180 */
181 public long getN() {
182 return n;
183 }
184
185 /**
186 * Returns the "predicted" <code>y</code> value associated with the
187 * supplied <code>x</code> value, based on the data that has been
188 * added to the model when this method is activated.
189 * <p>
190 * <code> predict(x) = intercept + slope * x </code></p>
191 * <p>
192 * <strong>Preconditions</strong>: <ul>
193 * <li>At least two observations (with at least two different x values)
194 * must have been added before invoking this method. If this method is
195 * invoked before a model can be estimated, <code>Double,NaN</code> is
196 * returned.
197 * </li></ul></p>
198 *
199 * @param x input <code>x</code> value
200 * @return predicted <code>y</code> value
201 */
202 public double predict(double x) {
203 double b1 = getSlope();
204 return getIntercept(b1) + b1 * x;
205 }
206
207 /**
208 * Returns the intercept of the estimated regression line.
209 * <p>
210 * The least squares estimate of the intercept is computed using the
211 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
212 * The intercept is sometimes denoted b0.</p>
213 * <p>
214 * <strong>Preconditions</strong>: <ul>
215 * <li>At least two observations (with at least two different x values)
216 * must have been added before invoking this method. If this method is
217 * invoked before a model can be estimated, <code>Double,NaN</code> is
218 * returned.
219 * </li></ul></p>
220 *
221 * @return the intercept of the regression line
222 */
223 public double getIntercept() {
224 return getIntercept(getSlope());
225 }
226
227 /**
228 * Returns the slope of the estimated regression line.
229 * <p>
230 * The least squares estimate of the slope is computed using the
231 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
232 * The slope is sometimes denoted b1.</p>
233 * <p>
234 * <strong>Preconditions</strong>: <ul>
235 * <li>At least two observations (with at least two different x values)
236 * must have been added before invoking this method. If this method is
237 * invoked before a model can be estimated, <code>Double.NaN</code> is
238 * returned.
239 * </li></ul></p>
240 *
241 * @return the slope of the regression line
242 */
243 public double getSlope() {
244 if (n < 2) {
245 return Double.NaN; //not enough data
246 }
247 if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
248 return Double.NaN; //not enough variation in x
249 }
250 return sumXY / sumXX;
251 }
252
253 /**
254 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
255 * sum of squared errors</a> (SSE) associated with the regression
256 * model.
257 * <p>
258 * The sum is computed using the computational formula</p>
259 * <p>
260 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
261 * <p>
262 * where <code>SYY</code> is the sum of the squared deviations of the y
263 * values about their mean, <code>SXX</code> is similarly defined and
264 * <code>SXY</code> is the sum of the products of x and y mean deviations.
265 * </p><p>
266 * The sums are accumulated using the updating algorithm referenced in
267 * {@link #addData}.</p>
268 * <p>
269 * The return value is constrained to be non-negative - i.e., if due to
270 * rounding errors the computational formula returns a negative result,
271 * 0 is returned.</p>
272 * <p>
273 * <strong>Preconditions</strong>: <ul>
274 * <li>At least two observations (with at least two different x values)
275 * must have been added before invoking this method. If this method is
276 * invoked before a model can be estimated, <code>Double,NaN</code> is
277 * returned.
278 * </li></ul></p>
279 *
280 * @return sum of squared errors associated with the regression model
281 */
282 public double getSumSquaredErrors() {
283 return Math.max(0d, sumYY - sumXY * sumXY / sumXX);
284 }
285
286 /**
287 * Returns the sum of squared deviations of the y values about their mean.
288 * <p>
289 * This is defined as SSTO
290 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
291 * <p>
292 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
293 *
294 * @return sum of squared deviations of y values
295 */
296 public double getTotalSumSquares() {
297 if (n < 2) {
298 return Double.NaN;
299 }
300 return sumYY;
301 }
302
303 /**
304 * Returns the sum of squared deviations of the predicted y values about
305 * their mean (which equals the mean of y).
306 * <p>
307 * This is usually abbreviated SSR or SSM. It is defined as SSM
308 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
309 * <p>
310 * <strong>Preconditions</strong>: <ul>
311 * <li>At least two observations (with at least two different x values)
312 * must have been added before invoking this method. If this method is
313 * invoked before a model can be estimated, <code>Double.NaN</code> is
314 * returned.
315 * </li></ul></p>
316 *
317 * @return sum of squared deviations of predicted y values
318 */
319 public double getRegressionSumSquares() {
320 return getRegressionSumSquares(getSlope());
321 }
322
323 /**
324 * Returns the sum of squared errors divided by the degrees of freedom,
325 * usually abbreviated MSE.
326 * <p>
327 * If there are fewer than <strong>three</strong> data pairs in the model,
328 * or if there is no variation in <code>x</code>, this returns
329 * <code>Double.NaN</code>.</p>
330 *
331 * @return sum of squared deviations of y values
332 */
333 public double getMeanSquareError() {
334 if (n < 3) {
335 return Double.NaN;
336 }
337 return getSumSquaredErrors() / (double) (n - 2);
338 }
339
340 /**
341 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
342 * Pearson's product moment correlation coefficient</a>,
343 * usually denoted r.
344 * <p>
345 * <strong>Preconditions</strong>: <ul>
346 * <li>At least two observations (with at least two different x values)
347 * must have been added before invoking this method. If this method is
348 * invoked before a model can be estimated, <code>Double,NaN</code> is
349 * returned.
350 * </li></ul></p>
351 *
352 * @return Pearson's r
353 */
354 public double getR() {
355 double b1 = getSlope();
356 double result = Math.sqrt(getRSquare());
357 if (b1 < 0) {
358 result = -result;
359 }
360 return result;
361 }
362
363 /**
364 * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
365 * coefficient of determination</a>,
366 * usually denoted r-square.
367 * <p>
368 * <strong>Preconditions</strong>: <ul>
369 * <li>At least two observations (with at least two different x values)
370 * must have been added before invoking this method. If this method is
371 * invoked before a model can be estimated, <code>Double,NaN</code> is
372 * returned.
373 * </li></ul></p>
374 *
375 * @return r-square
376 */
377 public double getRSquare() {
378 double ssto = getTotalSumSquares();
379 return (ssto - getSumSquaredErrors()) / ssto;
380 }
381
382 /**
383 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
384 * standard error of the intercept estimate</a>,
385 * usually denoted s(b0).
386 * <p>
387 * If there are fewer that <strong>three</strong> observations in the
388 * model, or if there is no variation in x, this returns
389 * <code>Double.NaN</code>.</p>
390 *
391 * @return standard error associated with intercept estimate
392 */
393 public double getInterceptStdErr() {
394 return Math.sqrt(
395 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
396 }
397
398 /**
399 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
400 * error of the slope estimate</a>,
401 * usually denoted s(b1).
402 * <p>
403 * If there are fewer that <strong>three</strong> data pairs in the model,
404 * or if there is no variation in x, this returns <code>Double.NaN</code>.
405 * </p>
406 *
407 * @return standard error associated with slope estimate
408 */
409 public double getSlopeStdErr() {
410 return Math.sqrt(getMeanSquareError() / sumXX);
411 }
412
413 /**
414 * Returns the half-width of a 95% confidence interval for the slope
415 * estimate.
416 * <p>
417 * The 95% confidence interval is</p>
418 * <p>
419 * <code>(getSlope() - getSlopeConfidenceInterval(),
420 * getSlope() + getSlopeConfidenceInterval())</code></p>
421 * <p>
422 * If there are fewer that <strong>three</strong> observations in the
423 * model, or if there is no variation in x, this returns
424 * <code>Double.NaN</code>.</p>
425 * <p>
426 * <strong>Usage Note</strong>:<br>
427 * The validity of this statistic depends on the assumption that the
428 * observations included in the model are drawn from a
429 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
430 * Bivariate Normal Distribution</a>.</p>
431 *
432 * @return half-width of 95% confidence interval for the slope estimate
433 * @throws MathException if the confidence interval can not be computed.
434 */
435 public double getSlopeConfidenceInterval() throws MathException {
436 return getSlopeConfidenceInterval(0.05d);
437 }
438
439 /**
440 * Returns the half-width of a (100-100*alpha)% confidence interval for
441 * the slope estimate.
442 * <p>
443 * The (100-100*alpha)% confidence interval is </p>
444 * <p>
445 * <code>(getSlope() - getSlopeConfidenceInterval(),
446 * getSlope() + getSlopeConfidenceInterval())</code></p>
447 * <p>
448 * To request, for example, a 99% confidence interval, use
449 * <code>alpha = .01</code></p>
450 * <p>
451 * <strong>Usage Note</strong>:<br>
452 * The validity of this statistic depends on the assumption that the
453 * observations included in the model are drawn from a
454 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
455 * Bivariate Normal Distribution</a>.</p>
456 * <p>
457 * <strong> Preconditions:</strong><ul>
458 * <li>If there are fewer that <strong>three</strong> observations in the
459 * model, or if there is no variation in x, this returns
460 * <code>Double.NaN</code>.
461 * </li>
462 * <li><code>(0 < alpha < 1)</code>; otherwise an
463 * <code>IllegalArgumentException</code> is thrown.
464 * </li></ul></p>
465 *
466 * @param alpha the desired significance level
467 * @return half-width of 95% confidence interval for the slope estimate
468 * @throws MathException if the confidence interval can not be computed.
469 */
470 public double getSlopeConfidenceInterval(double alpha)
471 throws MathException {
472 if (alpha >= 1 || alpha <= 0) {
473 throw new IllegalArgumentException();
474 }
475 return getSlopeStdErr() *
476 distribution.inverseCumulativeProbability(1d - alpha / 2d);
477 }
478
479 /**
480 * Returns the significance level of the slope (equiv) correlation.
481 * <p>
482 * Specifically, the returned value is the smallest <code>alpha</code>
483 * such that the slope confidence interval with significance level
484 * equal to <code>alpha</code> does not include <code>0</code>.
485 * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
486 * </p><p>
487 * <strong>Usage Note</strong>:<br>
488 * The validity of this statistic depends on the assumption that the
489 * observations included in the model are drawn from a
490 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
491 * Bivariate Normal Distribution</a>.</p>
492 * <p>
493 * If there are fewer that <strong>three</strong> observations in the
494 * model, or if there is no variation in x, this returns
495 * <code>Double.NaN</code>.</p>
496 *
497 * @return significance level for slope/correlation
498 * @throws MathException if the significance level can not be computed.
499 */
500 public double getSignificance() throws MathException {
501 return 2d * (1.0 - distribution.cumulativeProbability(
502 Math.abs(getSlope()) / getSlopeStdErr()));
503 }
504
505 // ---------------------Private methods-----------------------------------
506
507 /**
508 * Returns the intercept of the estimated regression line, given the slope.
509 * <p>
510 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
511 *
512 * @param slope current slope
513 * @return the intercept of the regression line
514 */
515 private double getIntercept(double slope) {
516 return (sumY - slope * sumX) / ((double) n);
517 }
518
519 /**
520 * Computes SSR from b1.
521 *
522 * @param slope regression slope estimate
523 * @return sum of squared deviations of predicted y values
524 */
525 private double getRegressionSumSquares(double slope) {
526 return slope * slope * sumXX;
527 }
528
529 /**
530 * Modify the distribution used to compute inference statistics.
531 * @param value the new distribution
532 * @since 1.2
533 */
534 public void setDistribution(TDistribution value) {
535 distribution = value;
536
537 // modify degrees of freedom
538 if (n > 2) {
539 distribution.setDegreesOfFreedom(n - 2);
540 }
541 }
542 }