001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.inference;
018    
019    import java.util.Collection;
020    
021    import org.apache.commons.math.MathException;
022    import org.apache.commons.math.MathRuntimeException;
023    import org.apache.commons.math.distribution.FDistribution;
024    import org.apache.commons.math.distribution.FDistributionImpl;
025    import org.apache.commons.math.stat.descriptive.summary.Sum;
026    import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
027    
028    
029    /**
030     * Implements one-way ANOVA statistics defined in the {@link OneWayAnovaImpl}
031     * interface.
032     *
033     * <p>Uses the
034     * {@link org.apache.commons.math.distribution.FDistribution
035     *  commons-math F Distribution implementation} to estimate exact p-values.</p>
036     *
037     * <p>This implementation is based on a description at
038     * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
039     * <pre>
040     * Abbreviations: bg = between groups,
041     *                wg = within groups,
042     *                ss = sum squared deviations
043     * </pre>
044     *
045     * @since 1.2
046     * @version $Revision: 825917 $ $Date: 2009-10-16 10:47:27 -0400 (Fri, 16 Oct 2009) $
047     */
048    public class OneWayAnovaImpl implements OneWayAnova  {
049    
050        /**
051         * Default constructor.
052         */
053        public OneWayAnovaImpl() {
054        }
055    
056        /**
057         * {@inheritDoc}<p>
058         * This implementation computes the F statistic using the definitional
059         * formula<pre>
060         *   F = msbg/mswg</pre>
061         * where<pre>
062         *  msbg = between group mean square
063         *  mswg = within group mean square</pre>
064         * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
065         * here</a></p>
066         */
067        public double anovaFValue(Collection<double[]> categoryData)
068            throws IllegalArgumentException, MathException {
069            AnovaStats a = anovaStats(categoryData);
070            return a.F;
071        }
072    
073        /**
074         * {@inheritDoc}<p>
075         * This implementation uses the
076         * {@link org.apache.commons.math.distribution.FDistribution
077         * commons-math F Distribution implementation} to estimate the exact
078         * p-value, using the formula<pre>
079         *   p = 1 - cumulativeProbability(F)</pre>
080         * where <code>F</code> is the F value and <code>cumulativeProbability</code>
081         * is the commons-math implementation of the F distribution.</p>
082         */
083        public double anovaPValue(Collection<double[]> categoryData)
084            throws IllegalArgumentException, MathException {
085            AnovaStats a = anovaStats(categoryData);
086            FDistribution fdist = new FDistributionImpl(a.dfbg, a.dfwg);
087            return 1.0 - fdist.cumulativeProbability(a.F);
088        }
089    
090        /**
091         * {@inheritDoc}<p>
092         * This implementation uses the
093         * {@link org.apache.commons.math.distribution.FDistribution
094         * commons-math F Distribution implementation} to estimate the exact
095         * p-value, using the formula<pre>
096         *   p = 1 - cumulativeProbability(F)</pre>
097         * where <code>F</code> is the F value and <code>cumulativeProbability</code>
098         * is the commons-math implementation of the F distribution.</p>
099         * <p>True is returned iff the estimated p-value is less than alpha.</p>
100         */
101        public boolean anovaTest(Collection<double[]> categoryData, double alpha)
102            throws IllegalArgumentException, MathException {
103            if ((alpha <= 0) || (alpha > 0.5)) {
104                throw MathRuntimeException.createIllegalArgumentException(
105                      "out of bounds significance level {0}, must be between {1} and {2}",
106                      alpha, 0, 0.5);
107            }
108            return anovaPValue(categoryData) < alpha;
109        }
110    
111    
112        /**
113         * This method actually does the calculations (except P-value).
114         *
115         * @param categoryData <code>Collection</code> of <code>double[]</code>
116         * arrays each containing data for one category
117         * @return computed AnovaStats
118         * @throws IllegalArgumentException if categoryData does not meet
119         * preconditions specified in the interface definition
120         * @throws MathException if an error occurs computing the Anova stats
121         */
122        private AnovaStats anovaStats(Collection<double[]> categoryData)
123            throws IllegalArgumentException, MathException {
124    
125            // check if we have enough categories
126            if (categoryData.size() < 2) {
127                throw MathRuntimeException.createIllegalArgumentException(
128                      "two or more categories required, got {0}",
129                      categoryData.size());
130            }
131    
132            // check if each category has enough data and all is double[]
133            for (double[] array : categoryData) {
134                if (array.length <= 1) {
135                    throw MathRuntimeException.createIllegalArgumentException(
136                          "two or more values required in each category, one has {0}",
137                          array.length);
138                }
139            }
140    
141            int dfwg = 0;
142            double sswg = 0;
143            Sum totsum = new Sum();
144            SumOfSquares totsumsq = new SumOfSquares();
145            int totnum = 0;
146    
147            for (double[] data : categoryData) {
148    
149                Sum sum = new Sum();
150                SumOfSquares sumsq = new SumOfSquares();
151                int num = 0;
152    
153                for (int i = 0; i < data.length; i++) {
154                    double val = data[i];
155    
156                    // within category
157                    num++;
158                    sum.increment(val);
159                    sumsq.increment(val);
160    
161                    // for all categories
162                    totnum++;
163                    totsum.increment(val);
164                    totsumsq.increment(val);
165                }
166                dfwg += num - 1;
167                double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
168                sswg += ss;
169            }
170            double sst = totsumsq.getResult() - totsum.getResult() *
171                totsum.getResult()/totnum;
172            double ssbg = sst - sswg;
173            int dfbg = categoryData.size() - 1;
174            double msbg = ssbg/dfbg;
175            double mswg = sswg/dfwg;
176            double F = msbg/mswg;
177    
178            return new AnovaStats(dfbg, dfwg, F);
179        }
180    
181        /**
182            Convenience class to pass dfbg,dfwg,F values around within AnovaImpl.
183            No get/set methods provided.
184        */
185        private static class AnovaStats {
186    
187            /** Degrees of freedom in numerator (between groups). */
188            private int dfbg;
189    
190            /** Degrees of freedom in denominator (within groups). */
191            private int dfwg;
192    
193            /** Statistic. */
194            private double F;
195    
196            /**
197             * Constructor
198             * @param dfbg degrees of freedom in numerator (between groups)
199             * @param dfwg degrees of freedom in denominator (within groups)
200             * @param F statistic
201             */
202            private AnovaStats(int dfbg, int dfwg, double F) {
203                this.dfbg = dfbg;
204                this.dfwg = dfwg;
205                this.F = F;
206            }
207        }
208    
209    }