1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.math.stat.descriptive;
18
19 import java.io.Serializable;
20 import java.util.Arrays;
21
22 import org.apache.commons.math.DimensionMismatchException;
23 import org.apache.commons.math.linear.RealMatrix;
24 import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
25 import org.apache.commons.math.stat.descriptive.moment.Mean;
26 import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
27 import org.apache.commons.math.stat.descriptive.rank.Max;
28 import org.apache.commons.math.stat.descriptive.rank.Min;
29 import org.apache.commons.math.stat.descriptive.summary.Sum;
30 import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
31 import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
32 import org.apache.commons.math.util.MathUtils;
33
34 /**
35 * <p>Computes summary statistics for a stream of n-tuples added using the
36 * {@link #addValue(double[]) addValue} method. The data values are not stored
37 * in memory, so this class can be used to compute statistics for very large
38 * n-tuple streams.</p>
39 *
40 * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
41 * summary state and compute statistics are configurable via setters.
42 * For example, the default implementation for the mean can be overridden by
43 * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
44 * parameters to these methods must implement the
45 * {@link StorelessUnivariateStatistic} interface and configuration must be
46 * completed before <code>addValue</code> is called. No configuration is
47 * necessary to use the default, commons-math provided implementations.</p>
48 *
49 * <p>To compute statistics for a stream of n-tuples, construct a
50 * MultivariateStatistics instance with dimension n and then use
51 * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
52 * methods where Xxx is a statistic return an array of <code>double</code>
53 * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
54 * value of the given statistic for data range consisting of the i<sup>th</sup> element of
55 * each of the input n-tuples. For example, if <code>addValue</code> is called
56 * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
57 * <code>getSum</code> will return a three-element array with values
58 * {0+3+6, 1+4+7, 2+5+8}</p>
59 *
60 * <p>Note: This class is not thread-safe. Use
61 * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
62 * threads is required.</p>
63 *
64 * @since 1.2
65 * @version $Revision: 618097 $ $Date: 2008-02-03 22:39:08 +0100 (dim., 03 févr. 2008) $
66 */
67 public class MultivariateSummaryStatistics
68 implements StatisticalMultivariateSummary, Serializable {
69
70 /** Serialization UID */
71 private static final long serialVersionUID = 2271900808994826718L;
72
73 /**
74 * Construct a MultivariateSummaryStatistics instance
75 * @param k dimension of the data
76 * @param isCovarianceBiasCorrected if true, the unbiased sample
77 * covariance is computed, otherwise the biased population covariance
78 * is computed
79 */
80 public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
81 this.k = k;
82
83 sumImpl = new StorelessUnivariateStatistic[k];
84 sumSqImpl = new StorelessUnivariateStatistic[k];
85 minImpl = new StorelessUnivariateStatistic[k];
86 maxImpl = new StorelessUnivariateStatistic[k];
87 sumLogImpl = new StorelessUnivariateStatistic[k];
88 geoMeanImpl = new StorelessUnivariateStatistic[k];
89 meanImpl = new StorelessUnivariateStatistic[k];
90
91 for (int i = 0; i < k; ++i) {
92 sumImpl[i] = new Sum();
93 sumSqImpl[i] = new SumOfSquares();
94 minImpl[i] = new Min();
95 maxImpl[i] = new Max();
96 sumLogImpl[i] = new SumOfLogs();
97 geoMeanImpl[i] = new GeometricMean();
98 meanImpl[i] = new Mean();
99 }
100
101 covarianceImpl =
102 new VectorialCovariance(k, isCovarianceBiasCorrected);
103
104 }
105
106 /** Dimension of the data. */
107 private int k;
108
109 /** Count of values that have been added */
110 private long n = 0;
111
112 /** Sum statistic implementation - can be reset by setter. */
113 private StorelessUnivariateStatistic[] sumImpl;
114
115 /** Sum of squares statistic implementation - can be reset by setter. */
116 private StorelessUnivariateStatistic[] sumSqImpl;
117
118 /** Minimum statistic implementation - can be reset by setter. */
119 private StorelessUnivariateStatistic[] minImpl;
120
121 /** Maximum statistic implementation - can be reset by setter. */
122 private StorelessUnivariateStatistic[] maxImpl;
123
124 /** Sum of log statistic implementation - can be reset by setter. */
125 private StorelessUnivariateStatistic[] sumLogImpl;
126
127 /** Geometric mean statistic implementation - can be reset by setter. */
128 private StorelessUnivariateStatistic[] geoMeanImpl;
129
130 /** Mean statistic implementation - can be reset by setter. */
131 private StorelessUnivariateStatistic[] meanImpl;
132
133 /** Covariance statistic implementation - cannot be reset. */
134 private VectorialCovariance covarianceImpl;
135
136 /**
137 * Add an n-tuple to the data
138 *
139 * @param value the n-tuple to add
140 * @throws DimensionMismatchException if the length of the array
141 * does not match the one used at construction
142 */
143 public void addValue(double[] value)
144 throws DimensionMismatchException {
145 checkDimension(value.length);
146 for (int i = 0; i < k; ++i) {
147 double v = value[i];
148 sumImpl[i].increment(v);
149 sumSqImpl[i].increment(v);
150 minImpl[i].increment(v);
151 maxImpl[i].increment(v);
152 sumLogImpl[i].increment(v);
153 geoMeanImpl[i].increment(v);
154 meanImpl[i].increment(v);
155 }
156 covarianceImpl.increment(value);
157 n++;
158 }
159
160 /**
161 * Returns the dimension of the data
162 * @return The dimension of the data
163 */
164 public int getDimension() {
165 return k;
166 }
167
168 /**
169 * Returns the number of available values
170 * @return The number of available values
171 */
172 public long getN() {
173 return n;
174 }
175
176 /**
177 * Returns an array of the results of a statistic.
178 * @param stats univariate statistic array
179 * @return results array
180 */
181 private double[] getResults(StorelessUnivariateStatistic[] stats) {
182 double[] results = new double[stats.length];
183 for (int i = 0; i < results.length; ++i) {
184 results[i] = stats[i].getResult();
185 }
186 return results;
187 }
188
189 /**
190 * Returns an array whose i<sup>th</sup> entry is the sum of the
191 * i<sup>th</sup> entries of the arrays that have been added using
192 * {@link #addValue(double[])}
193 *
194 * @return the array of component sums
195 */
196 public double[] getSum() {
197 return getResults(sumImpl);
198 }
199
200 /**
201 * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
202 * i<sup>th</sup> entries of the arrays that have been added using
203 * {@link #addValue(double[])}
204 *
205 * @return the array of component sums of squares
206 */
207 public double[] getSumSq() {
208 return getResults(sumSqImpl);
209 }
210
211 /**
212 * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
213 * i<sup>th</sup> entries of the arrays that have been added using
214 * {@link #addValue(double[])}
215 *
216 * @return the array of component log sums
217 */
218 public double[] getSumLog() {
219 return getResults(sumLogImpl);
220 }
221
222 /**
223 * Returns an array whose i<sup>th</sup> entry is the mean of the
224 * i<sup>th</sup> entries of the arrays that have been added using
225 * {@link #addValue(double[])}
226 *
227 * @return the array of component means
228 */
229 public double[] getMean() {
230 return getResults(meanImpl);
231 }
232
233 /**
234 * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
235 * i<sup>th</sup> entries of the arrays that have been added using
236 * {@link #addValue(double[])}
237 *
238 * @return the array of component standard deviations
239 */
240 public double[] getStandardDeviation() {
241 double[] stdDev = new double[k];
242 if (getN() < 1) {
243 Arrays.fill(stdDev, Double.NaN);
244 } else if (getN() < 2) {
245 Arrays.fill(stdDev, 0.0);
246 } else {
247 RealMatrix matrix = covarianceImpl.getResult();
248 for (int i = 0; i < k; ++i) {
249 stdDev[i] = Math.sqrt(matrix.getEntry(i, i));
250 }
251 }
252 return stdDev;
253 }
254
255 /**
256 * Returns the covariance matrix of the values that have been added.
257 *
258 * @return the covariance matrix
259 */
260 public RealMatrix getCovariance() {
261 return covarianceImpl.getResult();
262 }
263
264 /**
265 * Returns an array whose i<sup>th</sup> entry is the maximum of the
266 * i<sup>th</sup> entries of the arrays that have been added using
267 * {@link #addValue(double[])}
268 *
269 * @return the array of component maxima
270 */
271 public double[] getMax() {
272 return getResults(maxImpl);
273 }
274
275 /**
276 * Returns an array whose i<sup>th</sup> entry is the minimum of the
277 * i<sup>th</sup> entries of the arrays that have been added using
278 * {@link #addValue(double[])}
279 *
280 * @return the array of component minima
281 */
282 public double[] getMin() {
283 return getResults(minImpl);
284 }
285
286 /**
287 * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
288 * i<sup>th</sup> entries of the arrays that have been added using
289 * {@link #addValue(double[])}
290 *
291 * @return the array of component geometric means
292 */
293 public double[] getGeometricMean() {
294 return getResults(geoMeanImpl);
295 }
296
297 /**
298 * Generates a text report displaying
299 * summary statistics from values that
300 * have been added.
301 * @return String with line feeds displaying statistics
302 */
303 public String toString() {
304 StringBuffer outBuffer = new StringBuffer();
305 outBuffer.append("MultivariateSummaryStatistics:\n");
306 outBuffer.append("n: " + getN() + "\n");
307 append(outBuffer, getMin(), "min: ", ", ", "\n");
308 append(outBuffer, getMax(), "max: ", ", ", "\n");
309 append(outBuffer, getMean(), "mean: ", ", ", "\n");
310 append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n");
311 append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n");
312 append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n");
313 append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n");
314 outBuffer.append("covariance: " + getCovariance().toString() + "\n");
315 return outBuffer.toString();
316 }
317
318 /**
319 * Append a text representation of an array to a buffer.
320 * @param buffer buffer to fill
321 * @param data data array
322 * @param prefix text prefix
323 * @param separator elements separator
324 * @param suffix text suffix
325 */
326 private void append(StringBuffer buffer, double[] data,
327 String prefix, String separator, String suffix) {
328 buffer.append(prefix);
329 for (int i = 0; i < data.length; ++i) {
330 if (i > 0) {
331 buffer.append(separator);
332 }
333 buffer.append(data[i]);
334 }
335 buffer.append(suffix);
336 }
337
338 /**
339 * Resets all statistics and storage
340 */
341 public void clear() {
342 this.n = 0;
343 for (int i = 0; i < k; ++i) {
344 minImpl[i].clear();
345 maxImpl[i].clear();
346 sumImpl[i].clear();
347 sumLogImpl[i].clear();
348 sumSqImpl[i].clear();
349 geoMeanImpl[i].clear();
350 meanImpl[i].clear();
351 }
352 covarianceImpl.clear();
353 }
354
355 /**
356 * Returns true iff <code>object</code> is a <code>SummaryStatistics</code>
357 * instance and all statistics have the same values as this.
358 * @param object the object to test equality against.
359 * @return true if object equals this
360 */
361 public boolean equals(Object object) {
362 if (object == this ) {
363 return true;
364 }
365 if (object instanceof MultivariateSummaryStatistics == false) {
366 return false;
367 }
368 MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
369 return (MathUtils.equals(stat.getGeometricMean(),
370 this.getGeometricMean()) &&
371 MathUtils.equals(stat.getMax(), this.getMax()) &&
372 MathUtils.equals(stat.getMean(),this.getMean()) &&
373 MathUtils.equals(stat.getMin(),this.getMin()) &&
374 MathUtils.equals(stat.getN(), this.getN()) &&
375 MathUtils.equals(stat.getSum(), this.getSum()) &&
376 MathUtils.equals(stat.getSumSq(),this.getSumSq()) &&
377 MathUtils.equals(stat.getSumLog(),this.getSumLog()) &&
378 stat.getCovariance().equals(this.getCovariance()));
379 }
380
381 /**
382 * Returns hash code based on values of statistics
383 *
384 * @return hash code
385 */
386 public int hashCode() {
387 int result = 31 + MathUtils.hash(getGeometricMean());
388 result = result * 31 + MathUtils.hash(getGeometricMean());
389 result = result * 31 + MathUtils.hash(getMax());
390 result = result * 31 + MathUtils.hash(getMean());
391 result = result * 31 + MathUtils.hash(getMin());
392 result = result * 31 + MathUtils.hash(getN());
393 result = result * 31 + MathUtils.hash(getSum());
394 result = result * 31 + MathUtils.hash(getSumSq());
395 result = result * 31 + MathUtils.hash(getSumLog());
396 result = result * 31 + getCovariance().hashCode();
397 return result;
398 }
399
400 // Getters and setters for statistics implementations
401 /**
402 * Sets statistics implementations.
403 * @param newImpl new implementations for statistics
404 * @param oldImpl old implementations for statistics
405 * @throws DimensionMismatchException if the array dimension
406 * does not match the one used at construction
407 * @throws IllegalStateException if data has already been added
408 * (i.e if n > 0)
409 */
410 private void setImpl(StorelessUnivariateStatistic[] newImpl,
411 StorelessUnivariateStatistic[] oldImpl)
412 throws DimensionMismatchException, IllegalStateException {
413 checkEmpty();
414 checkDimension(newImpl.length);
415 System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
416 }
417
418 /**
419 * Returns the currently configured Sum implementation
420 *
421 * @return the StorelessUnivariateStatistic implementing the sum
422 */
423 public StorelessUnivariateStatistic[] getSumImpl() {
424 return (StorelessUnivariateStatistic[]) sumImpl.clone();
425 }
426
427 /**
428 * <p>Sets the implementation for the Sum.</p>
429 * <p>This method must be activated before any data has been added - i.e.,
430 * before {@link #addValue(double[]) addValue} has been used to add data;
431 * otherwise an IllegalStateException will be thrown.</p>
432 *
433 * @param sumImpl the StorelessUnivariateStatistic instance to use
434 * for computing the Sum
435 * @throws DimensionMismatchException if the array dimension
436 * does not match the one used at construction
437 * @throws IllegalStateException if data has already been added
438 * (i.e if n > 0)
439 */
440 public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
441 throws DimensionMismatchException {
442 setImpl(sumImpl, this.sumImpl);
443 }
444
445 /**
446 * Returns the currently configured sum of squares implementation
447 *
448 * @return the StorelessUnivariateStatistic implementing the sum of squares
449 */
450 public StorelessUnivariateStatistic[] getSumsqImpl() {
451 return (StorelessUnivariateStatistic[]) sumSqImpl.clone();
452 }
453
454 /**
455 * <p>Sets the implementation for the sum of squares.</p>
456 * <p>This method must be activated before any data has been added - i.e.,
457 * before {@link #addValue(double[]) addValue} has been used to add data;
458 * otherwise an IllegalStateException will be thrown.</p>
459 *
460 * @param sumsqImpl the StorelessUnivariateStatistic instance to use
461 * for computing the sum of squares
462 * @throws DimensionMismatchException if the array dimension
463 * does not match the one used at construction
464 * @throws IllegalStateException if data has already been added
465 * (i.e if n > 0)
466 */
467 public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
468 throws DimensionMismatchException {
469 setImpl(sumsqImpl, this.sumSqImpl);
470 }
471
472 /**
473 * Returns the currently configured minimum implementation
474 *
475 * @return the StorelessUnivariateStatistic implementing the minimum
476 */
477 public StorelessUnivariateStatistic[] getMinImpl() {
478 return (StorelessUnivariateStatistic[]) minImpl.clone();
479 }
480
481 /**
482 * <p>Sets the implementation for the minimum.</p>
483 * <p>This method must be activated before any data has been added - i.e.,
484 * before {@link #addValue(double[]) addValue} has been used to add data;
485 * otherwise an IllegalStateException will be thrown.</p>
486 *
487 * @param minImpl the StorelessUnivariateStatistic instance to use
488 * for computing the minimum
489 * @throws DimensionMismatchException if the array dimension
490 * does not match the one used at construction
491 * @throws IllegalStateException if data has already been added
492 * (i.e if n > 0)
493 */
494 public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
495 throws DimensionMismatchException {
496 setImpl(minImpl, this.minImpl);
497 }
498
499 /**
500 * Returns the currently configured maximum implementation
501 *
502 * @return the StorelessUnivariateStatistic implementing the maximum
503 */
504 public StorelessUnivariateStatistic[] getMaxImpl() {
505 return (StorelessUnivariateStatistic[]) maxImpl.clone();
506 }
507
508 /**
509 * <p>Sets the implementation for the maximum.</p>
510 * <p>This method must be activated before any data has been added - i.e.,
511 * before {@link #addValue(double[]) addValue} has been used to add data;
512 * otherwise an IllegalStateException will be thrown.</p>
513 *
514 * @param maxImpl the StorelessUnivariateStatistic instance to use
515 * for computing the maximum
516 * @throws DimensionMismatchException if the array dimension
517 * does not match the one used at construction
518 * @throws IllegalStateException if data has already been added
519 * (i.e if n > 0)
520 */
521 public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
522 throws DimensionMismatchException {
523 setImpl(maxImpl, this.maxImpl);
524 }
525
526 /**
527 * Returns the currently configured sum of logs implementation
528 *
529 * @return the StorelessUnivariateStatistic implementing the log sum
530 */
531 public StorelessUnivariateStatistic[] getSumLogImpl() {
532 return (StorelessUnivariateStatistic[]) sumLogImpl.clone();
533 }
534
535 /**
536 * <p>Sets the implementation for the sum of logs.</p>
537 * <p>This method must be activated before any data has been added - i.e.,
538 * before {@link #addValue(double[]) addValue} has been used to add data;
539 * otherwise an IllegalStateException will be thrown.</p>
540 *
541 * @param sumLogImpl the StorelessUnivariateStatistic instance to use
542 * for computing the log sum
543 * @throws DimensionMismatchException if the array dimension
544 * does not match the one used at construction
545 * @throws IllegalStateException if data has already been added
546 * (i.e if n > 0)
547 */
548 public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
549 throws DimensionMismatchException {
550 setImpl(sumLogImpl, this.sumLogImpl);
551 }
552
553 /**
554 * Returns the currently configured geometric mean implementation
555 *
556 * @return the StorelessUnivariateStatistic implementing the geometric mean
557 */
558 public StorelessUnivariateStatistic[] getGeoMeanImpl() {
559 return (StorelessUnivariateStatistic[]) geoMeanImpl.clone();
560 }
561
562 /**
563 * <p>Sets the implementation for the geometric mean.</p>
564 * <p>This method must be activated before any data has been added - i.e.,
565 * before {@link #addValue(double[]) addValue} has been used to add data;
566 * otherwise an IllegalStateException will be thrown.</p>
567 *
568 * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
569 * for computing the geometric mean
570 * @throws DimensionMismatchException if the array dimension
571 * does not match the one used at construction
572 * @throws IllegalStateException if data has already been added
573 * (i.e if n > 0)
574 */
575 public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
576 throws DimensionMismatchException {
577 setImpl(geoMeanImpl, this.geoMeanImpl);
578 }
579
580 /**
581 * Returns the currently configured mean implementation
582 *
583 * @return the StorelessUnivariateStatistic implementing the mean
584 */
585 public StorelessUnivariateStatistic[] getMeanImpl() {
586 return (StorelessUnivariateStatistic[]) meanImpl.clone();
587 }
588
589 /**
590 * <p>Sets the implementation for the mean.</p>
591 * <p>This method must be activated before any data has been added - i.e.,
592 * before {@link #addValue(double[]) addValue} has been used to add data;
593 * otherwise an IllegalStateException will be thrown.</p>
594 *
595 * @param meanImpl the StorelessUnivariateStatistic instance to use
596 * for computing the mean
597 * @throws DimensionMismatchException if the array dimension
598 * does not match the one used at construction
599 * @throws IllegalStateException if data has already been added
600 * (i.e if n > 0)
601 */
602 public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
603 throws DimensionMismatchException {
604 setImpl(meanImpl, this.meanImpl);
605 }
606
607 /**
608 * Throws IllegalStateException if n > 0.
609 */
610 private void checkEmpty() {
611 if (n > 0) {
612 throw new IllegalStateException(
613 "Implementations must be configured before values are added.");
614 }
615 }
616
617 /**
618 * Throws DimensionMismatchException if dimension != k.
619 * @param dimension dimension to check
620 * @throws DimensionMismatchException if dimension != k
621 */
622 private void checkDimension(int dimension)
623 throws DimensionMismatchException {
624 if (dimension != k) {
625 throw new DimensionMismatchException(dimension, k);
626 }
627 }
628
629 }