001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.math.stat.descriptive; 018 019 import java.io.Serializable; 020 import java.util.Arrays; 021 022 import org.apache.commons.math.DimensionMismatchException; 023 import org.apache.commons.math.MathRuntimeException; 024 import org.apache.commons.math.linear.RealMatrix; 025 import org.apache.commons.math.stat.descriptive.moment.GeometricMean; 026 import org.apache.commons.math.stat.descriptive.moment.Mean; 027 import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance; 028 import org.apache.commons.math.stat.descriptive.rank.Max; 029 import org.apache.commons.math.stat.descriptive.rank.Min; 030 import org.apache.commons.math.stat.descriptive.summary.Sum; 031 import org.apache.commons.math.stat.descriptive.summary.SumOfLogs; 032 import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; 033 import org.apache.commons.math.util.MathUtils; 034 035 /** 036 * <p>Computes summary statistics for a stream of n-tuples added using the 037 * {@link #addValue(double[]) addValue} method. The data values are not stored 038 * in memory, so this class can be used to compute statistics for very large 039 * n-tuple streams.</p> 040 * 041 * <p>The {@link StorelessUnivariateStatistic} instances used to maintain 042 * summary state and compute statistics are configurable via setters. 043 * For example, the default implementation for the mean can be overridden by 044 * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual 045 * parameters to these methods must implement the 046 * {@link StorelessUnivariateStatistic} interface and configuration must be 047 * completed before <code>addValue</code> is called. No configuration is 048 * necessary to use the default, commons-math provided implementations.</p> 049 * 050 * <p>To compute statistics for a stream of n-tuples, construct a 051 * MultivariateStatistics instance with dimension n and then use 052 * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code> 053 * methods where Xxx is a statistic return an array of <code>double</code> 054 * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the 055 * value of the given statistic for data range consisting of the i<sup>th</sup> element of 056 * each of the input n-tuples. For example, if <code>addValue</code> is called 057 * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8}, 058 * <code>getSum</code> will return a three-element array with values 059 * {0+3+6, 1+4+7, 2+5+8}</p> 060 * 061 * <p>Note: This class is not thread-safe. Use 062 * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple 063 * threads is required.</p> 064 * 065 * @since 1.2 066 * @version $Revision: 811833 $ $Date: 2009-09-06 12:27:50 -0400 (Sun, 06 Sep 2009) $ 067 */ 068 public class MultivariateSummaryStatistics 069 implements StatisticalMultivariateSummary, Serializable { 070 071 /** Serialization UID */ 072 private static final long serialVersionUID = 2271900808994826718L; 073 074 /** Dimension of the data. */ 075 private int k; 076 077 /** Count of values that have been added */ 078 private long n = 0; 079 080 /** Sum statistic implementation - can be reset by setter. */ 081 private StorelessUnivariateStatistic[] sumImpl; 082 083 /** Sum of squares statistic implementation - can be reset by setter. */ 084 private StorelessUnivariateStatistic[] sumSqImpl; 085 086 /** Minimum statistic implementation - can be reset by setter. */ 087 private StorelessUnivariateStatistic[] minImpl; 088 089 /** Maximum statistic implementation - can be reset by setter. */ 090 private StorelessUnivariateStatistic[] maxImpl; 091 092 /** Sum of log statistic implementation - can be reset by setter. */ 093 private StorelessUnivariateStatistic[] sumLogImpl; 094 095 /** Geometric mean statistic implementation - can be reset by setter. */ 096 private StorelessUnivariateStatistic[] geoMeanImpl; 097 098 /** Mean statistic implementation - can be reset by setter. */ 099 private StorelessUnivariateStatistic[] meanImpl; 100 101 /** Covariance statistic implementation - cannot be reset. */ 102 private VectorialCovariance covarianceImpl; 103 104 /** 105 * Construct a MultivariateSummaryStatistics instance 106 * @param k dimension of the data 107 * @param isCovarianceBiasCorrected if true, the unbiased sample 108 * covariance is computed, otherwise the biased population covariance 109 * is computed 110 */ 111 public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) { 112 this.k = k; 113 114 sumImpl = new StorelessUnivariateStatistic[k]; 115 sumSqImpl = new StorelessUnivariateStatistic[k]; 116 minImpl = new StorelessUnivariateStatistic[k]; 117 maxImpl = new StorelessUnivariateStatistic[k]; 118 sumLogImpl = new StorelessUnivariateStatistic[k]; 119 geoMeanImpl = new StorelessUnivariateStatistic[k]; 120 meanImpl = new StorelessUnivariateStatistic[k]; 121 122 for (int i = 0; i < k; ++i) { 123 sumImpl[i] = new Sum(); 124 sumSqImpl[i] = new SumOfSquares(); 125 minImpl[i] = new Min(); 126 maxImpl[i] = new Max(); 127 sumLogImpl[i] = new SumOfLogs(); 128 geoMeanImpl[i] = new GeometricMean(); 129 meanImpl[i] = new Mean(); 130 } 131 132 covarianceImpl = 133 new VectorialCovariance(k, isCovarianceBiasCorrected); 134 135 } 136 137 /** 138 * Add an n-tuple to the data 139 * 140 * @param value the n-tuple to add 141 * @throws DimensionMismatchException if the length of the array 142 * does not match the one used at construction 143 */ 144 public void addValue(double[] value) 145 throws DimensionMismatchException { 146 checkDimension(value.length); 147 for (int i = 0; i < k; ++i) { 148 double v = value[i]; 149 sumImpl[i].increment(v); 150 sumSqImpl[i].increment(v); 151 minImpl[i].increment(v); 152 maxImpl[i].increment(v); 153 sumLogImpl[i].increment(v); 154 geoMeanImpl[i].increment(v); 155 meanImpl[i].increment(v); 156 } 157 covarianceImpl.increment(value); 158 n++; 159 } 160 161 /** 162 * Returns the dimension of the data 163 * @return The dimension of the data 164 */ 165 public int getDimension() { 166 return k; 167 } 168 169 /** 170 * Returns the number of available values 171 * @return The number of available values 172 */ 173 public long getN() { 174 return n; 175 } 176 177 /** 178 * Returns an array of the results of a statistic. 179 * @param stats univariate statistic array 180 * @return results array 181 */ 182 private double[] getResults(StorelessUnivariateStatistic[] stats) { 183 double[] results = new double[stats.length]; 184 for (int i = 0; i < results.length; ++i) { 185 results[i] = stats[i].getResult(); 186 } 187 return results; 188 } 189 190 /** 191 * Returns an array whose i<sup>th</sup> entry is the sum of the 192 * i<sup>th</sup> entries of the arrays that have been added using 193 * {@link #addValue(double[])} 194 * 195 * @return the array of component sums 196 */ 197 public double[] getSum() { 198 return getResults(sumImpl); 199 } 200 201 /** 202 * Returns an array whose i<sup>th</sup> entry is the sum of squares of the 203 * i<sup>th</sup> entries of the arrays that have been added using 204 * {@link #addValue(double[])} 205 * 206 * @return the array of component sums of squares 207 */ 208 public double[] getSumSq() { 209 return getResults(sumSqImpl); 210 } 211 212 /** 213 * Returns an array whose i<sup>th</sup> entry is the sum of logs of the 214 * i<sup>th</sup> entries of the arrays that have been added using 215 * {@link #addValue(double[])} 216 * 217 * @return the array of component log sums 218 */ 219 public double[] getSumLog() { 220 return getResults(sumLogImpl); 221 } 222 223 /** 224 * Returns an array whose i<sup>th</sup> entry is the mean of the 225 * i<sup>th</sup> entries of the arrays that have been added using 226 * {@link #addValue(double[])} 227 * 228 * @return the array of component means 229 */ 230 public double[] getMean() { 231 return getResults(meanImpl); 232 } 233 234 /** 235 * Returns an array whose i<sup>th</sup> entry is the standard deviation of the 236 * i<sup>th</sup> entries of the arrays that have been added using 237 * {@link #addValue(double[])} 238 * 239 * @return the array of component standard deviations 240 */ 241 public double[] getStandardDeviation() { 242 double[] stdDev = new double[k]; 243 if (getN() < 1) { 244 Arrays.fill(stdDev, Double.NaN); 245 } else if (getN() < 2) { 246 Arrays.fill(stdDev, 0.0); 247 } else { 248 RealMatrix matrix = covarianceImpl.getResult(); 249 for (int i = 0; i < k; ++i) { 250 stdDev[i] = Math.sqrt(matrix.getEntry(i, i)); 251 } 252 } 253 return stdDev; 254 } 255 256 /** 257 * Returns the covariance matrix of the values that have been added. 258 * 259 * @return the covariance matrix 260 */ 261 public RealMatrix getCovariance() { 262 return covarianceImpl.getResult(); 263 } 264 265 /** 266 * Returns an array whose i<sup>th</sup> entry is the maximum of the 267 * i<sup>th</sup> entries of the arrays that have been added using 268 * {@link #addValue(double[])} 269 * 270 * @return the array of component maxima 271 */ 272 public double[] getMax() { 273 return getResults(maxImpl); 274 } 275 276 /** 277 * Returns an array whose i<sup>th</sup> entry is the minimum of the 278 * i<sup>th</sup> entries of the arrays that have been added using 279 * {@link #addValue(double[])} 280 * 281 * @return the array of component minima 282 */ 283 public double[] getMin() { 284 return getResults(minImpl); 285 } 286 287 /** 288 * Returns an array whose i<sup>th</sup> entry is the geometric mean of the 289 * i<sup>th</sup> entries of the arrays that have been added using 290 * {@link #addValue(double[])} 291 * 292 * @return the array of component geometric means 293 */ 294 public double[] getGeometricMean() { 295 return getResults(geoMeanImpl); 296 } 297 298 /** 299 * Generates a text report displaying 300 * summary statistics from values that 301 * have been added. 302 * @return String with line feeds displaying statistics 303 */ 304 @Override 305 public String toString() { 306 StringBuffer outBuffer = new StringBuffer(); 307 outBuffer.append("MultivariateSummaryStatistics:\n"); 308 outBuffer.append("n: " + getN() + "\n"); 309 append(outBuffer, getMin(), "min: ", ", ", "\n"); 310 append(outBuffer, getMax(), "max: ", ", ", "\n"); 311 append(outBuffer, getMean(), "mean: ", ", ", "\n"); 312 append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n"); 313 append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n"); 314 append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n"); 315 append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n"); 316 outBuffer.append("covariance: " + getCovariance().toString() + "\n"); 317 return outBuffer.toString(); 318 } 319 320 /** 321 * Append a text representation of an array to a buffer. 322 * @param buffer buffer to fill 323 * @param data data array 324 * @param prefix text prefix 325 * @param separator elements separator 326 * @param suffix text suffix 327 */ 328 private void append(StringBuffer buffer, double[] data, 329 String prefix, String separator, String suffix) { 330 buffer.append(prefix); 331 for (int i = 0; i < data.length; ++i) { 332 if (i > 0) { 333 buffer.append(separator); 334 } 335 buffer.append(data[i]); 336 } 337 buffer.append(suffix); 338 } 339 340 /** 341 * Resets all statistics and storage 342 */ 343 public void clear() { 344 this.n = 0; 345 for (int i = 0; i < k; ++i) { 346 minImpl[i].clear(); 347 maxImpl[i].clear(); 348 sumImpl[i].clear(); 349 sumLogImpl[i].clear(); 350 sumSqImpl[i].clear(); 351 geoMeanImpl[i].clear(); 352 meanImpl[i].clear(); 353 } 354 covarianceImpl.clear(); 355 } 356 357 /** 358 * Returns true iff <code>object</code> is a <code>SummaryStatistics</code> 359 * instance and all statistics have the same values as this. 360 * @param object the object to test equality against. 361 * @return true if object equals this 362 */ 363 @Override 364 public boolean equals(Object object) { 365 if (object == this ) { 366 return true; 367 } 368 if (object instanceof MultivariateSummaryStatistics == false) { 369 return false; 370 } 371 MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object; 372 return MathUtils.equals(stat.getGeometricMean(), getGeometricMean()) && 373 MathUtils.equals(stat.getMax(), getMax()) && 374 MathUtils.equals(stat.getMean(), getMean()) && 375 MathUtils.equals(stat.getMin(), getMin()) && 376 MathUtils.equals(stat.getN(), getN()) && 377 MathUtils.equals(stat.getSum(), getSum()) && 378 MathUtils.equals(stat.getSumSq(), getSumSq()) && 379 MathUtils.equals(stat.getSumLog(), getSumLog()) && 380 stat.getCovariance().equals( getCovariance()); 381 } 382 383 /** 384 * Returns hash code based on values of statistics 385 * 386 * @return hash code 387 */ 388 @Override 389 public int hashCode() { 390 int result = 31 + MathUtils.hash(getGeometricMean()); 391 result = result * 31 + MathUtils.hash(getGeometricMean()); 392 result = result * 31 + MathUtils.hash(getMax()); 393 result = result * 31 + MathUtils.hash(getMean()); 394 result = result * 31 + MathUtils.hash(getMin()); 395 result = result * 31 + MathUtils.hash(getN()); 396 result = result * 31 + MathUtils.hash(getSum()); 397 result = result * 31 + MathUtils.hash(getSumSq()); 398 result = result * 31 + MathUtils.hash(getSumLog()); 399 result = result * 31 + getCovariance().hashCode(); 400 return result; 401 } 402 403 // Getters and setters for statistics implementations 404 /** 405 * Sets statistics implementations. 406 * @param newImpl new implementations for statistics 407 * @param oldImpl old implementations for statistics 408 * @throws DimensionMismatchException if the array dimension 409 * does not match the one used at construction 410 * @throws IllegalStateException if data has already been added 411 * (i.e if n > 0) 412 */ 413 private void setImpl(StorelessUnivariateStatistic[] newImpl, 414 StorelessUnivariateStatistic[] oldImpl) 415 throws DimensionMismatchException, IllegalStateException { 416 checkEmpty(); 417 checkDimension(newImpl.length); 418 System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length); 419 } 420 421 /** 422 * Returns the currently configured Sum implementation 423 * 424 * @return the StorelessUnivariateStatistic implementing the sum 425 */ 426 public StorelessUnivariateStatistic[] getSumImpl() { 427 return sumImpl.clone(); 428 } 429 430 /** 431 * <p>Sets the implementation for the Sum.</p> 432 * <p>This method must be activated before any data has been added - i.e., 433 * before {@link #addValue(double[]) addValue} has been used to add data; 434 * otherwise an IllegalStateException will be thrown.</p> 435 * 436 * @param sumImpl the StorelessUnivariateStatistic instance to use 437 * for computing the Sum 438 * @throws DimensionMismatchException if the array dimension 439 * does not match the one used at construction 440 * @throws IllegalStateException if data has already been added 441 * (i.e if n > 0) 442 */ 443 public void setSumImpl(StorelessUnivariateStatistic[] sumImpl) 444 throws DimensionMismatchException { 445 setImpl(sumImpl, this.sumImpl); 446 } 447 448 /** 449 * Returns the currently configured sum of squares implementation 450 * 451 * @return the StorelessUnivariateStatistic implementing the sum of squares 452 */ 453 public StorelessUnivariateStatistic[] getSumsqImpl() { 454 return sumSqImpl.clone(); 455 } 456 457 /** 458 * <p>Sets the implementation for the sum of squares.</p> 459 * <p>This method must be activated before any data has been added - i.e., 460 * before {@link #addValue(double[]) addValue} has been used to add data; 461 * otherwise an IllegalStateException will be thrown.</p> 462 * 463 * @param sumsqImpl the StorelessUnivariateStatistic instance to use 464 * for computing the sum of squares 465 * @throws DimensionMismatchException if the array dimension 466 * does not match the one used at construction 467 * @throws IllegalStateException if data has already been added 468 * (i.e if n > 0) 469 */ 470 public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl) 471 throws DimensionMismatchException { 472 setImpl(sumsqImpl, this.sumSqImpl); 473 } 474 475 /** 476 * Returns the currently configured minimum implementation 477 * 478 * @return the StorelessUnivariateStatistic implementing the minimum 479 */ 480 public StorelessUnivariateStatistic[] getMinImpl() { 481 return minImpl.clone(); 482 } 483 484 /** 485 * <p>Sets the implementation for the minimum.</p> 486 * <p>This method must be activated before any data has been added - i.e., 487 * before {@link #addValue(double[]) addValue} has been used to add data; 488 * otherwise an IllegalStateException will be thrown.</p> 489 * 490 * @param minImpl the StorelessUnivariateStatistic instance to use 491 * for computing the minimum 492 * @throws DimensionMismatchException if the array dimension 493 * does not match the one used at construction 494 * @throws IllegalStateException if data has already been added 495 * (i.e if n > 0) 496 */ 497 public void setMinImpl(StorelessUnivariateStatistic[] minImpl) 498 throws DimensionMismatchException { 499 setImpl(minImpl, this.minImpl); 500 } 501 502 /** 503 * Returns the currently configured maximum implementation 504 * 505 * @return the StorelessUnivariateStatistic implementing the maximum 506 */ 507 public StorelessUnivariateStatistic[] getMaxImpl() { 508 return maxImpl.clone(); 509 } 510 511 /** 512 * <p>Sets the implementation for the maximum.</p> 513 * <p>This method must be activated before any data has been added - i.e., 514 * before {@link #addValue(double[]) addValue} has been used to add data; 515 * otherwise an IllegalStateException will be thrown.</p> 516 * 517 * @param maxImpl the StorelessUnivariateStatistic instance to use 518 * for computing the maximum 519 * @throws DimensionMismatchException if the array dimension 520 * does not match the one used at construction 521 * @throws IllegalStateException if data has already been added 522 * (i.e if n > 0) 523 */ 524 public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl) 525 throws DimensionMismatchException { 526 setImpl(maxImpl, this.maxImpl); 527 } 528 529 /** 530 * Returns the currently configured sum of logs implementation 531 * 532 * @return the StorelessUnivariateStatistic implementing the log sum 533 */ 534 public StorelessUnivariateStatistic[] getSumLogImpl() { 535 return sumLogImpl.clone(); 536 } 537 538 /** 539 * <p>Sets the implementation for the sum of logs.</p> 540 * <p>This method must be activated before any data has been added - i.e., 541 * before {@link #addValue(double[]) addValue} has been used to add data; 542 * otherwise an IllegalStateException will be thrown.</p> 543 * 544 * @param sumLogImpl the StorelessUnivariateStatistic instance to use 545 * for computing the log sum 546 * @throws DimensionMismatchException if the array dimension 547 * does not match the one used at construction 548 * @throws IllegalStateException if data has already been added 549 * (i.e if n > 0) 550 */ 551 public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl) 552 throws DimensionMismatchException { 553 setImpl(sumLogImpl, this.sumLogImpl); 554 } 555 556 /** 557 * Returns the currently configured geometric mean implementation 558 * 559 * @return the StorelessUnivariateStatistic implementing the geometric mean 560 */ 561 public StorelessUnivariateStatistic[] getGeoMeanImpl() { 562 return geoMeanImpl.clone(); 563 } 564 565 /** 566 * <p>Sets the implementation for the geometric mean.</p> 567 * <p>This method must be activated before any data has been added - i.e., 568 * before {@link #addValue(double[]) addValue} has been used to add data; 569 * otherwise an IllegalStateException will be thrown.</p> 570 * 571 * @param geoMeanImpl the StorelessUnivariateStatistic instance to use 572 * for computing the geometric mean 573 * @throws DimensionMismatchException if the array dimension 574 * does not match the one used at construction 575 * @throws IllegalStateException if data has already been added 576 * (i.e if n > 0) 577 */ 578 public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl) 579 throws DimensionMismatchException { 580 setImpl(geoMeanImpl, this.geoMeanImpl); 581 } 582 583 /** 584 * Returns the currently configured mean implementation 585 * 586 * @return the StorelessUnivariateStatistic implementing the mean 587 */ 588 public StorelessUnivariateStatistic[] getMeanImpl() { 589 return meanImpl.clone(); 590 } 591 592 /** 593 * <p>Sets the implementation for the mean.</p> 594 * <p>This method must be activated before any data has been added - i.e., 595 * before {@link #addValue(double[]) addValue} has been used to add data; 596 * otherwise an IllegalStateException will be thrown.</p> 597 * 598 * @param meanImpl the StorelessUnivariateStatistic instance to use 599 * for computing the mean 600 * @throws DimensionMismatchException if the array dimension 601 * does not match the one used at construction 602 * @throws IllegalStateException if data has already been added 603 * (i.e if n > 0) 604 */ 605 public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl) 606 throws DimensionMismatchException { 607 setImpl(meanImpl, this.meanImpl); 608 } 609 610 /** 611 * Throws IllegalStateException if n > 0. 612 */ 613 private void checkEmpty() { 614 if (n > 0) { 615 throw MathRuntimeException.createIllegalStateException( 616 "{0} values have been added before statistic is configured", 617 n); 618 } 619 } 620 621 /** 622 * Throws DimensionMismatchException if dimension != k. 623 * @param dimension dimension to check 624 * @throws DimensionMismatchException if dimension != k 625 */ 626 private void checkDimension(int dimension) 627 throws DimensionMismatchException { 628 if (dimension != k) { 629 throw new DimensionMismatchException(dimension, k); 630 } 631 } 632 633 }