001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math.random;
019    
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileReader;
023    import java.io.IOException;
024    import java.io.InputStreamReader;
025    import java.io.Serializable;
026    import java.net.URL;
027    import java.util.ArrayList;
028    import java.util.List;
029    
030    import org.apache.commons.math.MathRuntimeException;
031    import org.apache.commons.math.stat.descriptive.StatisticalSummary;
032    import org.apache.commons.math.stat.descriptive.SummaryStatistics;
033    
034    /**
035     * Implements <code>EmpiricalDistribution</code> interface.  This implementation
036     * uses what amounts to the
037     * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
038     * Variable Kernel Method</a> with Gaussian smoothing:<p>
039     * <strong>Digesting the input file</strong>
040     * <ol><li>Pass the file once to compute min and max.</li>
041     * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
042     * <li>Pass the data file again, computing bin counts and univariate
043     *     statistics (mean, std dev.) for each of the bins </li>
044     * <li>Divide the interval (0,1) into subintervals associated with the bins,
045     *     with the length of a bin's subinterval proportional to its count.</li></ol>
046     * <strong>Generating random values from the distribution</strong><ol>
047     * <li>Generate a uniformly distributed value in (0,1) </li>
048     * <li>Select the subinterval to which the value belongs.
049     * <li>Generate a random Gaussian value with mean = mean of the associated
050     *     bin and std dev = std dev of associated bin.</li></ol></p><p>
051     *<strong>USAGE NOTES:</strong><ul>
052     *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
053     *    is to set the bin count to approximately the length of the input file divided
054     *    by 10. </li>
055     *<li>The input file <i>must</i> be a plain text file containing one valid numeric
056     *    entry per line.</li>
057     * </ul></p>
058     *
059     * @version $Revision: 925812 $ $Date: 2010-03-21 11:49:31 -0400 (Sun, 21 Mar 2010) $
060     */
061    public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
062    
063        /** Serializable version identifier */
064        private static final long serialVersionUID = 5729073523949762654L;
065    
066        /** List of SummaryStatistics objects characterizing the bins */
067        private List<SummaryStatistics> binStats = null;
068    
069        /** Sample statistics */
070        private SummaryStatistics sampleStats = null;
071    
072        /** Max loaded value */
073        private double max = Double.NEGATIVE_INFINITY;
074    
075        /** Min loaded value */
076        private double min = Double.POSITIVE_INFINITY;
077    
078        /** Grid size */
079        private double delta = 0d;
080    
081        /** number of bins */
082        private int binCount = 1000;
083    
084        /** is the distribution loaded? */
085        private boolean loaded = false;
086    
087        /** upper bounds of subintervals in (0,1) "belonging" to the bins */
088        private double[] upperBounds = null;
089    
090        /** RandomData instance to use in repeated calls to getNext() */
091        private RandomData randomData = new RandomDataImpl();
092    
093        /**
094         * Creates a new EmpiricalDistribution with the default bin count.
095         */
096        public EmpiricalDistributionImpl() {
097            binStats = new ArrayList<SummaryStatistics>();
098        }
099    
100        /**
101         * Creates a new EmpiricalDistribution  with the specified bin count.
102         *
103         * @param binCount number of bins
104         */
105        public EmpiricalDistributionImpl(int binCount) {
106            this.binCount = binCount;
107            binStats = new ArrayList<SummaryStatistics>();
108        }
109    
110         /**
111         * Computes the empirical distribution from the provided
112         * array of numbers.
113         *
114         * @param in the input data array
115         */
116        public void load(double[] in) {
117            DataAdapter da = new ArrayDataAdapter(in);
118            try {
119                da.computeStats();
120                fillBinStats(in);
121            } catch (IOException e) {
122                throw new MathRuntimeException(e);
123            }
124            loaded = true;
125    
126        }
127    
128        /**
129         * Computes the empirical distribution using data read from a URL.
130         * @param url  url of the input file
131         *
132         * @throws IOException if an IO error occurs
133         */
134        public void load(URL url) throws IOException {
135            BufferedReader in =
136                new BufferedReader(new InputStreamReader(url.openStream()));
137            try {
138                DataAdapter da = new StreamDataAdapter(in);
139                da.computeStats();
140                if (sampleStats.getN() == 0) {
141                    throw MathRuntimeException.createEOFException("URL {0} contains no data",
142                                                                  url);
143                }
144                in = new BufferedReader(new InputStreamReader(url.openStream()));
145                fillBinStats(in);
146                loaded = true;
147            } finally {
148               try {
149                   in.close();
150               } catch (IOException ex) {
151                   // ignore
152               }
153            }
154        }
155    
156        /**
157         * Computes the empirical distribution from the input file.
158         *
159         * @param file the input file
160         * @throws IOException if an IO error occurs
161         */
162        public void load(File file) throws IOException {
163            BufferedReader in = new BufferedReader(new FileReader(file));
164            try {
165                DataAdapter da = new StreamDataAdapter(in);
166                da.computeStats();
167                in = new BufferedReader(new FileReader(file));
168                fillBinStats(in);
169                loaded = true;
170            } finally {
171                try {
172                    in.close();
173                } catch (IOException ex) {
174                    // ignore
175                }
176            }
177        }
178    
179        /**
180         * Provides methods for computing <code>sampleStats</code> and
181         * <code>beanStats</code> abstracting the source of data.
182         */
183        private abstract class DataAdapter{
184    
185            /**
186             * Compute bin stats.
187             *
188             * @throws IOException  if an error occurs computing bin stats
189             */
190            public abstract void computeBinStats() throws IOException;
191    
192            /**
193             * Compute sample statistics.
194             *
195             * @throws IOException if an error occurs computing sample stats
196             */
197            public abstract void computeStats() throws IOException;
198    
199        }
200    
201        /**
202         * Factory of <code>DataAdapter</code> objects. For every supported source
203         * of data (array of doubles, file, etc.) an instance of the proper object
204         * is returned.
205         */
206        private class DataAdapterFactory{
207            /**
208             * Creates a DataAdapter from a data object
209             *
210             * @param in object providing access to the data
211             * @return DataAdapter instance
212             */
213            public DataAdapter getAdapter(Object in) {
214                if (in instanceof BufferedReader) {
215                    BufferedReader inputStream = (BufferedReader) in;
216                    return new StreamDataAdapter(inputStream);
217                } else if (in instanceof double[]) {
218                    double[] inputArray = (double[]) in;
219                    return new ArrayDataAdapter(inputArray);
220                } else {
221                    throw MathRuntimeException.createIllegalArgumentException(
222                          "input data comes from unsupported datasource: {0}, " +
223                          "supported sources: {1}, {2}",
224                          in.getClass().getName(),
225                          BufferedReader.class.getName(), double[].class.getName());
226                }
227            }
228        }
229        /**
230         * <code>DataAdapter</code> for data provided through some input stream
231         */
232        private class StreamDataAdapter extends DataAdapter{
233    
234            /** Input stream providing access to the data */
235            private BufferedReader inputStream;
236    
237            /**
238             * Create a StreamDataAdapter from a BufferedReader
239             *
240             * @param in BufferedReader input stream
241             */
242            public StreamDataAdapter(BufferedReader in){
243                super();
244                inputStream = in;
245            }
246    
247            /** {@inheritDoc} */
248            @Override
249            public void computeBinStats() throws IOException {
250                String str = null;
251                double val = 0.0d;
252                while ((str = inputStream.readLine()) != null) {
253                    val = Double.parseDouble(str);
254                    SummaryStatistics stats = binStats.get(findBin(val));
255                    stats.addValue(val);
256                }
257    
258                inputStream.close();
259                inputStream = null;
260            }
261    
262            /** {@inheritDoc} */
263            @Override
264            public void computeStats() throws IOException {
265                String str = null;
266                double val = 0.0;
267                sampleStats = new SummaryStatistics();
268                while ((str = inputStream.readLine()) != null) {
269                    val = Double.valueOf(str).doubleValue();
270                    sampleStats.addValue(val);
271                }
272                inputStream.close();
273                inputStream = null;
274            }
275        }
276    
277        /**
278         * <code>DataAdapter</code> for data provided as array of doubles.
279         */
280        private class ArrayDataAdapter extends DataAdapter {
281    
282            /** Array of input  data values */
283            private double[] inputArray;
284    
285            /**
286             * Construct an ArrayDataAdapter from a double[] array
287             *
288             * @param in double[] array holding the data
289             */
290            public ArrayDataAdapter(double[] in){
291                super();
292                inputArray = in;
293            }
294    
295            /** {@inheritDoc} */
296            @Override
297            public void computeStats() throws IOException {
298                sampleStats = new SummaryStatistics();
299                for (int i = 0; i < inputArray.length; i++) {
300                    sampleStats.addValue(inputArray[i]);
301                }
302            }
303    
304            /** {@inheritDoc} */
305            @Override
306            public void computeBinStats() throws IOException {
307                for (int i = 0; i < inputArray.length; i++) {
308                    SummaryStatistics stats =
309                        binStats.get(findBin(inputArray[i]));
310                    stats.addValue(inputArray[i]);
311                }
312            }
313        }
314    
315        /**
316         * Fills binStats array (second pass through data file).
317         *
318         * @param in object providing access to the data
319         * @throws IOException  if an IO error occurs
320         */
321        private void fillBinStats(Object in) throws IOException {
322            // Set up grid
323            min = sampleStats.getMin();
324            max = sampleStats.getMax();
325            delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
326    
327            // Initialize binStats ArrayList
328            if (!binStats.isEmpty()) {
329                binStats.clear();
330            }
331            for (int i = 0; i < binCount; i++) {
332                SummaryStatistics stats = new SummaryStatistics();
333                binStats.add(i,stats);
334            }
335    
336            // Filling data in binStats Array
337            DataAdapterFactory aFactory = new DataAdapterFactory();
338            DataAdapter da = aFactory.getAdapter(in);
339            da.computeBinStats();
340    
341            // Assign upperBounds based on bin counts
342            upperBounds = new double[binCount];
343            upperBounds[0] =
344            ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
345            for (int i = 1; i < binCount-1; i++) {
346                upperBounds[i] = upperBounds[i-1] +
347                ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
348            }
349            upperBounds[binCount-1] = 1.0d;
350        }
351    
352        /**
353         * Returns the index of the bin to which the given value belongs
354         *
355         * @param value  the value whose bin we are trying to find
356         * @return the index of the bin containing the value
357         */
358        private int findBin(double value) {
359            return Math.min(
360                    Math.max((int) Math.ceil((value- min) / delta) - 1, 0),
361                    binCount - 1);
362            }
363    
364        /**
365         * Generates a random value from this distribution.
366         *
367         * @return the random value.
368         * @throws IllegalStateException if the distribution has not been loaded
369         */
370        public double getNextValue() throws IllegalStateException {
371    
372            if (!loaded) {
373                throw MathRuntimeException.createIllegalStateException("distribution not loaded");
374            }
375    
376            // Start with a uniformly distributed random number in (0,1)
377            double x = Math.random();
378    
379            // Use this to select the bin and generate a Gaussian within the bin
380            for (int i = 0; i < binCount; i++) {
381               if (x <= upperBounds[i]) {
382                   SummaryStatistics stats = binStats.get(i);
383                   if (stats.getN() > 0) {
384                       if (stats.getStandardDeviation() > 0) {  // more than one obs
385                            return randomData.nextGaussian
386                                (stats.getMean(),stats.getStandardDeviation());
387                       } else {
388                           return stats.getMean(); // only one obs in bin
389                       }
390                   }
391               }
392            }
393            throw new MathRuntimeException("no bin selected");
394        }
395    
396        /**
397         * Returns a {@link StatisticalSummary} describing this distribution.
398         * <strong>Preconditions:</strong><ul>
399         * <li>the distribution must be loaded before invoking this method</li></ul>
400         *
401         * @return the sample statistics
402         * @throws IllegalStateException if the distribution has not been loaded
403         */
404        public StatisticalSummary getSampleStats() {
405            return sampleStats;
406        }
407    
408        /**
409         * Returns the number of bins.
410         *
411         * @return the number of bins.
412         */
413        public int getBinCount() {
414            return binCount;
415        }
416    
417        /**
418         * Returns a List of {@link SummaryStatistics} instances containing
419         * statistics describing the values in each of the bins.  The list is
420         * indexed on the bin number.
421         *
422         * @return List of bin statistics.
423         */
424        public List<SummaryStatistics> getBinStats() {
425            return binStats;
426        }
427    
428        /**
429         * <p>Returns a fresh copy of the array of upper bounds for the bins.
430         * Bins are: <br/>
431         * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
432         *  (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
433         *
434         * <p>Note: In versions 1.0-2.0 of commons-math, this method
435         * incorrectly returned the array of probability generator upper
436         * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
437         *
438         * @return array of bin upper bounds
439         * @since 2.1
440         */
441        public double[] getUpperBounds() {
442            double[] binUpperBounds = new double[binCount];
443            binUpperBounds[0] = min + delta;
444            for (int i = 1; i < binCount - 1; i++) {
445                binUpperBounds[i] = binUpperBounds[i-1] + delta;
446            }
447            binUpperBounds[binCount - 1] = max;
448            return binUpperBounds;
449        }
450    
451        /**
452         * <p>Returns a fresh copy of the array of upper bounds of the subintervals
453         * of [0,1] used in generating data from the empirical distribution.
454         * Subintervals correspond to bins with lengths proportional to bin counts.</p>
455         *
456         * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
457         * by {@link #getUpperBounds()}.</p>
458         *
459         * @since 2.1
460         * @return array of upper bounds of subintervals used in data generation
461         */
462        public double[] getGeneratorUpperBounds() {
463            int len = upperBounds.length;
464            double[] out = new double[len];
465            System.arraycopy(upperBounds, 0, out, 0, len);
466            return out;
467        }
468    
469        /**
470         * Property indicating whether or not the distribution has been loaded.
471         *
472         * @return true if the distribution has been loaded
473         */
474        public boolean isLoaded() {
475            return loaded;
476        }
477    }