001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive.moment;
018    
019    import java.io.Serializable;
020    
021    import org.apache.commons.math.MathRuntimeException;
022    import org.apache.commons.math.stat.descriptive.WeightedEvaluation;
023    import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic;
024    
025    /**
026     * Computes the variance of the available values.  By default, the unbiased
027     * "sample variance" definitional formula is used:
028     * <p>
029     * variance = sum((x_i - mean)^2) / (n - 1) </p>
030     * <p>
031     * where mean is the {@link Mean} and <code>n</code> is the number
032     * of sample observations.</p>
033     * <p>
034     * The definitional formula does not have good numerical properties, so
035     * this implementation does not compute the statistic using the definitional
036     * formula. <ul>
037     * <li> The <code>getResult</code> method computes the variance using
038     * updating formulas based on West's algorithm, as described in
039     * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
040     * J. G. Lewis 1979, <i>Communications of the ACM</i>,
041     * vol. 22 no. 9, pp. 526-531.</a></li>
042     * <li> The <code>evaluate</code> methods leverage the fact that they have the
043     * full array of values in memory to execute a two-pass algorithm.
044     * Specifically, these methods use the "corrected two-pass algorithm" from
045     * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
046     * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
047     * Note that adding values using <code>increment</code> or
048     * <code>incrementAll</code> and then executing <code>getResult</code> will
049     * sometimes give a different, less accurate, result than executing
050     * <code>evaluate</code> with the full array of values. The former approach
051     * should only be used when the full array of values is not available.</p>
052     * <p>
053     * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
054     * be computed using this statistic.  The <code>isBiasCorrected</code>
055     * property determines whether the "population" or "sample" value is
056     * returned by the <code>evaluate</code> and <code>getResult</code> methods.
057     * To compute population variances, set this property to <code>false.</code>
058     * </p>
059     * <p>
060     * <strong>Note that this implementation is not synchronized.</strong> If
061     * multiple threads access an instance of this class concurrently, and at least
062     * one of the threads invokes the <code>increment()</code> or
063     * <code>clear()</code> method, it must be synchronized externally.</p>
064     *
065     * @version $Revision: 908626 $ $Date: 2010-02-10 13:44:42 -0500 (Wed, 10 Feb 2010) $
066     */
067    public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
068    
069        /** Serializable version identifier */
070        private static final long serialVersionUID = -9111962718267217978L;
071    
072        /** SecondMoment is used in incremental calculation of Variance*/
073        protected SecondMoment moment = null;
074    
075        /**
076         * Boolean test to determine if this Variance should also increment
077         * the second moment, this evaluates to false when this Variance is
078         * constructed with an external SecondMoment as a parameter.
079         */
080        protected boolean incMoment = true;
081    
082        /**
083         * Determines whether or not bias correction is applied when computing the
084         * value of the statisic.  True means that bias is corrected.  See
085         * {@link Variance} for details on the formula.
086         */
087        private boolean isBiasCorrected = true;
088    
089        /**
090         * Constructs a Variance with default (true) <code>isBiasCorrected</code>
091         * property.
092         */
093        public Variance() {
094            moment = new SecondMoment();
095        }
096    
097        /**
098         * Constructs a Variance based on an external second moment.
099         *
100         * @param m2 the SecondMoment (Third or Fourth moments work
101         * here as well.)
102         */
103        public Variance(final SecondMoment m2) {
104            incMoment = false;
105            this.moment = m2;
106        }
107    
108        /**
109         * Constructs a Variance with the specified <code>isBiasCorrected</code>
110         * property
111         *
112         * @param isBiasCorrected  setting for bias correction - true means
113         * bias will be corrected and is equivalent to using the argumentless
114         * constructor
115         */
116        public Variance(boolean isBiasCorrected) {
117            moment = new SecondMoment();
118            this.isBiasCorrected = isBiasCorrected;
119        }
120    
121        /**
122         * Constructs a Variance with the specified <code>isBiasCorrected</code>
123         * property and the supplied external second moment.
124         *
125         * @param isBiasCorrected  setting for bias correction - true means
126         * bias will be corrected
127         * @param m2 the SecondMoment (Third or Fourth moments work
128         * here as well.)
129         */
130        public Variance(boolean isBiasCorrected, SecondMoment m2) {
131            incMoment = false;
132            this.moment = m2;
133            this.isBiasCorrected = isBiasCorrected;
134        }
135    
136        /**
137         * Copy constructor, creates a new {@code Variance} identical
138         * to the {@code original}
139         *
140         * @param original the {@code Variance} instance to copy
141         */
142        public Variance(Variance original) {
143            copy(original, this);
144        }
145    
146        /**
147         * {@inheritDoc}
148         * <p>If all values are available, it is more accurate to use
149         * {@link #evaluate(double[])} rather than adding values one at a time
150         * using this method and then executing {@link #getResult}, since
151         * <code>evaluate</code> leverages the fact that is has the full
152         * list of values together to execute a two-pass algorithm.
153         * See {@link Variance}.</p>
154         */
155        @Override
156        public void increment(final double d) {
157            if (incMoment) {
158                moment.increment(d);
159            }
160        }
161    
162        /**
163         * {@inheritDoc}
164         */
165        @Override
166        public double getResult() {
167                if (moment.n == 0) {
168                    return Double.NaN;
169                } else if (moment.n == 1) {
170                    return 0d;
171                } else {
172                    if (isBiasCorrected) {
173                        return moment.m2 / (moment.n - 1d);
174                    } else {
175                        return moment.m2 / (moment.n);
176                    }
177                }
178        }
179    
180        /**
181         * {@inheritDoc}
182         */
183        public long getN() {
184            return moment.getN();
185        }
186    
187        /**
188         * {@inheritDoc}
189         */
190        @Override
191        public void clear() {
192            if (incMoment) {
193                moment.clear();
194            }
195        }
196    
197        /**
198         * Returns the variance of the entries in the input array, or
199         * <code>Double.NaN</code> if the array is empty.
200         * <p>
201         * See {@link Variance} for details on the computing algorithm.</p>
202         * <p>
203         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
204         * <p>
205         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
206         * <p>
207         * Does not change the internal state of the statistic.</p>
208         *
209         * @param values the input array
210         * @return the variance of the values or Double.NaN if length = 0
211         * @throws IllegalArgumentException if the array is null
212         */
213        @Override
214        public double evaluate(final double[] values) {
215            if (values == null) {
216                throw MathRuntimeException.createIllegalArgumentException("input values array is null");
217            }
218            return evaluate(values, 0, values.length);
219        }
220    
221        /**
222         * Returns the variance of the entries in the specified portion of
223         * the input array, or <code>Double.NaN</code> if the designated subarray
224         * is empty.
225         * <p>
226         * See {@link Variance} for details on the computing algorithm.</p>
227         * <p>
228         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
229         * <p>
230         * Does not change the internal state of the statistic.</p>
231         * <p>
232         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
233         *
234         * @param values the input array
235         * @param begin index of the first array element to include
236         * @param length the number of elements to include
237         * @return the variance of the values or Double.NaN if length = 0
238         * @throws IllegalArgumentException if the array is null or the array index
239         *  parameters are not valid
240         */
241        @Override
242        public double evaluate(final double[] values, final int begin, final int length) {
243    
244            double var = Double.NaN;
245    
246            if (test(values, begin, length)) {
247                clear();
248                if (length == 1) {
249                    var = 0.0;
250                } else if (length > 1) {
251                    Mean mean = new Mean();
252                    double m = mean.evaluate(values, begin, length);
253                    var = evaluate(values, m, begin, length);
254                }
255            }
256            return var;
257        }
258    
259        /**
260         * <p>Returns the weighted variance of the entries in the specified portion of
261         * the input array, or <code>Double.NaN</code> if the designated subarray
262         * is empty.</p>
263         * <p>
264         * Uses the formula <pre>
265         *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
266         * </pre>
267         * where weightedMean is the weighted mean</p>
268         * <p>
269         * This formula will not return the same result as the unweighted variance when all
270         * weights are equal, unless all weights are equal to 1. The formula assumes that
271         * weights are to be treated as "expansion values," as will be the case if for example
272         * the weights represent frequency counts. To normalize weights so that the denominator
273         * in the variance computation equals the length of the input vector minus one, use <pre>
274         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
275         * </pre>
276         * <p>
277         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
278         * <p>
279         * Throws <code>IllegalArgumentException</code> if any of the following are true:
280         * <ul><li>the values array is null</li>
281         *     <li>the weights array is null</li>
282         *     <li>the weights array does not have the same length as the values array</li>
283         *     <li>the weights array contains one or more infinite values</li>
284         *     <li>the weights array contains one or more NaN values</li>
285         *     <li>the weights array contains negative values</li>
286         *     <li>the start and length arguments do not determine a valid array</li>
287         * </ul></p>
288         * <p>
289         * Does not change the internal state of the statistic.</p>
290         * <p>
291         * Throws <code>IllegalArgumentException</code> if either array is null.</p>
292         *
293         * @param values the input array
294         * @param weights the weights array
295         * @param begin index of the first array element to include
296         * @param length the number of elements to include
297         * @return the weighted variance of the values or Double.NaN if length = 0
298         * @throws IllegalArgumentException if the parameters are not valid
299         * @since 2.1
300         */
301        public double evaluate(final double[] values, final double[] weights,
302                               final int begin, final int length) {
303    
304            double var = Double.NaN;
305    
306            if (test(values, weights,begin, length)) {
307                clear();
308                if (length == 1) {
309                    var = 0.0;
310                } else if (length > 1) {
311                    Mean mean = new Mean();
312                    double m = mean.evaluate(values, weights, begin, length);
313                    var = evaluate(values, weights, m, begin, length);
314                }
315            }
316            return var;
317        }
318    
319        /**
320         * <p>
321         * Returns the weighted variance of the entries in the the input array.</p>
322         * <p>
323         * Uses the formula <pre>
324         *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
325         * </pre>
326         * where weightedMean is the weighted mean</p>
327         * <p>
328         * This formula will not return the same result as the unweighted variance when all
329         * weights are equal, unless all weights are equal to 1. The formula assumes that
330         * weights are to be treated as "expansion values," as will be the case if for example
331         * the weights represent frequency counts. To normalize weights so that the denominator
332         * in the variance computation equals the length of the input vector minus one, use <pre>
333         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
334         * </pre>
335         * <p>
336         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
337         * <p>
338         * Throws <code>IllegalArgumentException</code> if any of the following are true:
339         * <ul><li>the values array is null</li>
340         *     <li>the weights array is null</li>
341         *     <li>the weights array does not have the same length as the values array</li>
342         *     <li>the weights array contains one or more infinite values</li>
343         *     <li>the weights array contains one or more NaN values</li>
344         *     <li>the weights array contains negative values</li>
345         * </ul></p>
346         * <p>
347         * Does not change the internal state of the statistic.</p>
348         * <p>
349         * Throws <code>IllegalArgumentException</code> if either array is null.</p>
350         *
351         * @param values the input array
352         * @param weights the weights array
353         * @return the weighted variance of the values
354         * @throws IllegalArgumentException if the parameters are not valid
355         * @since 2.1
356         */
357        public double evaluate(final double[] values, final double[] weights) {
358            return evaluate(values, weights, 0, values.length);
359        }
360    
361        /**
362         * Returns the variance of the entries in the specified portion of
363         * the input array, using the precomputed mean value.  Returns
364         * <code>Double.NaN</code> if the designated subarray is empty.
365         * <p>
366         * See {@link Variance} for details on the computing algorithm.</p>
367         * <p>
368         * The formula used assumes that the supplied mean value is the arithmetic
369         * mean of the sample data, not a known population parameter.  This method
370         * is supplied only to save computation when the mean has already been
371         * computed.</p>
372         * <p>
373         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
374         * <p>
375         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
376         * <p>
377         * Does not change the internal state of the statistic.</p>
378         *
379         * @param values the input array
380         * @param mean the precomputed mean value
381         * @param begin index of the first array element to include
382         * @param length the number of elements to include
383         * @return the variance of the values or Double.NaN if length = 0
384         * @throws IllegalArgumentException if the array is null or the array index
385         *  parameters are not valid
386         */
387        public double evaluate(final double[] values, final double mean,
388                final int begin, final int length) {
389    
390            double var = Double.NaN;
391    
392            if (test(values, begin, length)) {
393                if (length == 1) {
394                    var = 0.0;
395                } else if (length > 1) {
396                    double accum = 0.0;
397                    double dev = 0.0;
398                    double accum2 = 0.0;
399                    for (int i = begin; i < begin + length; i++) {
400                        dev = values[i] - mean;
401                        accum += dev * dev;
402                        accum2 += dev;
403                    }
404                    double len = length;
405                    if (isBiasCorrected) {
406                        var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
407                    } else {
408                        var = (accum - (accum2 * accum2 / len)) / len;
409                    }
410                }
411            }
412            return var;
413        }
414    
415        /**
416         * Returns the variance of the entries in the input array, using the
417         * precomputed mean value.  Returns <code>Double.NaN</code> if the array
418         * is empty.
419         * <p>
420         * See {@link Variance} for details on the computing algorithm.</p>
421         * <p>
422         * If <code>isBiasCorrected</code> is <code>true</code> the formula used
423         * assumes that the supplied mean value is the arithmetic mean of the
424         * sample data, not a known population parameter.  If the mean is a known
425         * population parameter, or if the "population" version of the variance is
426         * desired, set <code>isBiasCorrected</code> to <code>false</code> before
427         * invoking this method.</p>
428         * <p>
429         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
430         * <p>
431         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
432         * <p>
433         * Does not change the internal state of the statistic.</p>
434         *
435         * @param values the input array
436         * @param mean the precomputed mean value
437         * @return the variance of the values or Double.NaN if the array is empty
438         * @throws IllegalArgumentException if the array is null
439         */
440        public double evaluate(final double[] values, final double mean) {
441            return evaluate(values, mean, 0, values.length);
442        }
443    
444        /**
445         * Returns the weighted variance of the entries in the specified portion of
446         * the input array, using the precomputed weighted mean value.  Returns
447         * <code>Double.NaN</code> if the designated subarray is empty.
448         * <p>
449         * Uses the formula <pre>
450         *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
451         * </pre></p>
452         * <p>
453         * The formula used assumes that the supplied mean value is the weighted arithmetic
454         * mean of the sample data, not a known population parameter. This method
455         * is supplied only to save computation when the mean has already been
456         * computed.</p>
457         * <p>
458         * This formula will not return the same result as the unweighted variance when all
459         * weights are equal, unless all weights are equal to 1. The formula assumes that
460         * weights are to be treated as "expansion values," as will be the case if for example
461         * the weights represent frequency counts. To normalize weights so that the denominator
462         * in the variance computation equals the length of the input vector minus one, use <pre>
463         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
464         * </pre>
465         * <p>
466         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
467         * <p>
468         * Throws <code>IllegalArgumentException</code> if any of the following are true:
469         * <ul><li>the values array is null</li>
470         *     <li>the weights array is null</li>
471         *     <li>the weights array does not have the same length as the values array</li>
472         *     <li>the weights array contains one or more infinite values</li>
473         *     <li>the weights array contains one or more NaN values</li>
474         *     <li>the weights array contains negative values</li>
475         *     <li>the start and length arguments do not determine a valid array</li>
476         * </ul></p>
477         * <p>
478         * Does not change the internal state of the statistic.</p>
479         *
480         * @param values the input array
481         * @param weights the weights array
482         * @param mean the precomputed weighted mean value
483         * @param begin index of the first array element to include
484         * @param length the number of elements to include
485         * @return the variance of the values or Double.NaN if length = 0
486         * @throws IllegalArgumentException if the parameters are not valid
487         * @since 2.1
488         */
489        public double evaluate(final double[] values, final double[] weights,
490                               final double mean, final int begin, final int length) {
491    
492            double var = Double.NaN;
493    
494            if (test(values, weights, begin, length)) {
495                if (length == 1) {
496                    var = 0.0;
497                } else if (length > 1) {
498                    double accum = 0.0;
499                    double dev = 0.0;
500                    double accum2 = 0.0;
501                    for (int i = begin; i < begin + length; i++) {
502                        dev = values[i] - mean;
503                        accum += weights[i] * (dev * dev);
504                        accum2 += weights[i] * dev;
505                    }
506    
507                    double sumWts = 0;
508                    for (int i = 0; i < weights.length; i++) {
509                        sumWts += weights[i];
510                    }
511    
512                    if (isBiasCorrected) {
513                        var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
514                    } else {
515                        var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
516                    }
517                }
518            }
519            return var;
520        }
521    
522        /**
523         * <p>Returns the weighted variance of the values in the input array, using
524         * the precomputed weighted mean value.</p>
525         * <p>
526         * Uses the formula <pre>
527         *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
528         * </pre></p>
529         * <p>
530         * The formula used assumes that the supplied mean value is the weighted arithmetic
531         * mean of the sample data, not a known population parameter. This method
532         * is supplied only to save computation when the mean has already been
533         * computed.</p>
534         * <p>
535         * This formula will not return the same result as the unweighted variance when all
536         * weights are equal, unless all weights are equal to 1. The formula assumes that
537         * weights are to be treated as "expansion values," as will be the case if for example
538         * the weights represent frequency counts. To normalize weights so that the denominator
539         * in the variance computation equals the length of the input vector minus one, use <pre>
540         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
541         * </pre>
542         * <p>
543         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
544         * <p>
545         * Throws <code>IllegalArgumentException</code> if any of the following are true:
546         * <ul><li>the values array is null</li>
547         *     <li>the weights array is null</li>
548         *     <li>the weights array does not have the same length as the values array</li>
549         *     <li>the weights array contains one or more infinite values</li>
550         *     <li>the weights array contains one or more NaN values</li>
551         *     <li>the weights array contains negative values</li>
552         * </ul></p>
553         * <p>
554         * Does not change the internal state of the statistic.</p>
555         *
556         * @param values the input array
557         * @param weights the weights array
558         * @param mean the precomputed weighted mean value
559         * @return the variance of the values or Double.NaN if length = 0
560         * @throws IllegalArgumentException if the parameters are not valid
561         * @since 2.1
562         */
563        public double evaluate(final double[] values, final double[] weights, final double mean) {
564            return evaluate(values, weights, mean, 0, values.length);
565        }
566    
567        /**
568         * @return Returns the isBiasCorrected.
569         */
570        public boolean isBiasCorrected() {
571            return isBiasCorrected;
572        }
573    
574        /**
575         * @param biasCorrected The isBiasCorrected to set.
576         */
577        public void setBiasCorrected(boolean biasCorrected) {
578            this.isBiasCorrected = biasCorrected;
579        }
580    
581        /**
582         * {@inheritDoc}
583         */
584        @Override
585        public Variance copy() {
586            Variance result = new Variance();
587            copy(this, result);
588            return result;
589        }
590    
591    
592        /**
593         * Copies source to dest.
594         * <p>Neither source nor dest can be null.</p>
595         *
596         * @param source Variance to copy
597         * @param dest Variance to copy to
598         * @throws NullPointerException if either source or dest is null
599         */
600        public static void copy(Variance source, Variance dest) {
601            dest.moment = source.moment.copy();
602            dest.isBiasCorrected = source.isBiasCorrected;
603            dest.incMoment = source.incMoment;
604        }
605    
606    }