001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.math.random; 019 020 import java.io.BufferedReader; 021 import java.io.File; 022 import java.io.FileReader; 023 import java.io.IOException; 024 import java.io.InputStreamReader; 025 import java.io.Serializable; 026 import java.net.URL; 027 import java.util.ArrayList; 028 import java.util.List; 029 030 import org.apache.commons.math.MathRuntimeException; 031 import org.apache.commons.math.stat.descriptive.StatisticalSummary; 032 import org.apache.commons.math.stat.descriptive.SummaryStatistics; 033 034 /** 035 * Implements <code>EmpiricalDistribution</code> interface. This implementation 036 * uses what amounts to the 037 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html"> 038 * Variable Kernel Method</a> with Gaussian smoothing:<p> 039 * <strong>Digesting the input file</strong> 040 * <ol><li>Pass the file once to compute min and max.</li> 041 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li> 042 * <li>Pass the data file again, computing bin counts and univariate 043 * statistics (mean, std dev.) for each of the bins </li> 044 * <li>Divide the interval (0,1) into subintervals associated with the bins, 045 * with the length of a bin's subinterval proportional to its count.</li></ol> 046 * <strong>Generating random values from the distribution</strong><ol> 047 * <li>Generate a uniformly distributed value in (0,1) </li> 048 * <li>Select the subinterval to which the value belongs. 049 * <li>Generate a random Gaussian value with mean = mean of the associated 050 * bin and std dev = std dev of associated bin.</li></ol></p><p> 051 *<strong>USAGE NOTES:</strong><ul> 052 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb 053 * is to set the bin count to approximately the length of the input file divided 054 * by 10. </li> 055 *<li>The input file <i>must</i> be a plain text file containing one valid numeric 056 * entry per line.</li> 057 * </ul></p> 058 * 059 * @version $Revision: 925812 $ $Date: 2010-03-21 11:49:31 -0400 (Sun, 21 Mar 2010) $ 060 */ 061 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution { 062 063 /** Serializable version identifier */ 064 private static final long serialVersionUID = 5729073523949762654L; 065 066 /** List of SummaryStatistics objects characterizing the bins */ 067 private List<SummaryStatistics> binStats = null; 068 069 /** Sample statistics */ 070 private SummaryStatistics sampleStats = null; 071 072 /** Max loaded value */ 073 private double max = Double.NEGATIVE_INFINITY; 074 075 /** Min loaded value */ 076 private double min = Double.POSITIVE_INFINITY; 077 078 /** Grid size */ 079 private double delta = 0d; 080 081 /** number of bins */ 082 private int binCount = 1000; 083 084 /** is the distribution loaded? */ 085 private boolean loaded = false; 086 087 /** upper bounds of subintervals in (0,1) "belonging" to the bins */ 088 private double[] upperBounds = null; 089 090 /** RandomData instance to use in repeated calls to getNext() */ 091 private RandomData randomData = new RandomDataImpl(); 092 093 /** 094 * Creates a new EmpiricalDistribution with the default bin count. 095 */ 096 public EmpiricalDistributionImpl() { 097 binStats = new ArrayList<SummaryStatistics>(); 098 } 099 100 /** 101 * Creates a new EmpiricalDistribution with the specified bin count. 102 * 103 * @param binCount number of bins 104 */ 105 public EmpiricalDistributionImpl(int binCount) { 106 this.binCount = binCount; 107 binStats = new ArrayList<SummaryStatistics>(); 108 } 109 110 /** 111 * Computes the empirical distribution from the provided 112 * array of numbers. 113 * 114 * @param in the input data array 115 */ 116 public void load(double[] in) { 117 DataAdapter da = new ArrayDataAdapter(in); 118 try { 119 da.computeStats(); 120 fillBinStats(in); 121 } catch (IOException e) { 122 throw new MathRuntimeException(e); 123 } 124 loaded = true; 125 126 } 127 128 /** 129 * Computes the empirical distribution using data read from a URL. 130 * @param url url of the input file 131 * 132 * @throws IOException if an IO error occurs 133 */ 134 public void load(URL url) throws IOException { 135 BufferedReader in = 136 new BufferedReader(new InputStreamReader(url.openStream())); 137 try { 138 DataAdapter da = new StreamDataAdapter(in); 139 da.computeStats(); 140 if (sampleStats.getN() == 0) { 141 throw MathRuntimeException.createEOFException("URL {0} contains no data", 142 url); 143 } 144 in = new BufferedReader(new InputStreamReader(url.openStream())); 145 fillBinStats(in); 146 loaded = true; 147 } finally { 148 try { 149 in.close(); 150 } catch (IOException ex) { 151 // ignore 152 } 153 } 154 } 155 156 /** 157 * Computes the empirical distribution from the input file. 158 * 159 * @param file the input file 160 * @throws IOException if an IO error occurs 161 */ 162 public void load(File file) throws IOException { 163 BufferedReader in = new BufferedReader(new FileReader(file)); 164 try { 165 DataAdapter da = new StreamDataAdapter(in); 166 da.computeStats(); 167 in = new BufferedReader(new FileReader(file)); 168 fillBinStats(in); 169 loaded = true; 170 } finally { 171 try { 172 in.close(); 173 } catch (IOException ex) { 174 // ignore 175 } 176 } 177 } 178 179 /** 180 * Provides methods for computing <code>sampleStats</code> and 181 * <code>beanStats</code> abstracting the source of data. 182 */ 183 private abstract class DataAdapter{ 184 185 /** 186 * Compute bin stats. 187 * 188 * @throws IOException if an error occurs computing bin stats 189 */ 190 public abstract void computeBinStats() throws IOException; 191 192 /** 193 * Compute sample statistics. 194 * 195 * @throws IOException if an error occurs computing sample stats 196 */ 197 public abstract void computeStats() throws IOException; 198 199 } 200 201 /** 202 * Factory of <code>DataAdapter</code> objects. For every supported source 203 * of data (array of doubles, file, etc.) an instance of the proper object 204 * is returned. 205 */ 206 private class DataAdapterFactory{ 207 /** 208 * Creates a DataAdapter from a data object 209 * 210 * @param in object providing access to the data 211 * @return DataAdapter instance 212 */ 213 public DataAdapter getAdapter(Object in) { 214 if (in instanceof BufferedReader) { 215 BufferedReader inputStream = (BufferedReader) in; 216 return new StreamDataAdapter(inputStream); 217 } else if (in instanceof double[]) { 218 double[] inputArray = (double[]) in; 219 return new ArrayDataAdapter(inputArray); 220 } else { 221 throw MathRuntimeException.createIllegalArgumentException( 222 "input data comes from unsupported datasource: {0}, " + 223 "supported sources: {1}, {2}", 224 in.getClass().getName(), 225 BufferedReader.class.getName(), double[].class.getName()); 226 } 227 } 228 } 229 /** 230 * <code>DataAdapter</code> for data provided through some input stream 231 */ 232 private class StreamDataAdapter extends DataAdapter{ 233 234 /** Input stream providing access to the data */ 235 private BufferedReader inputStream; 236 237 /** 238 * Create a StreamDataAdapter from a BufferedReader 239 * 240 * @param in BufferedReader input stream 241 */ 242 public StreamDataAdapter(BufferedReader in){ 243 super(); 244 inputStream = in; 245 } 246 247 /** {@inheritDoc} */ 248 @Override 249 public void computeBinStats() throws IOException { 250 String str = null; 251 double val = 0.0d; 252 while ((str = inputStream.readLine()) != null) { 253 val = Double.parseDouble(str); 254 SummaryStatistics stats = binStats.get(findBin(val)); 255 stats.addValue(val); 256 } 257 258 inputStream.close(); 259 inputStream = null; 260 } 261 262 /** {@inheritDoc} */ 263 @Override 264 public void computeStats() throws IOException { 265 String str = null; 266 double val = 0.0; 267 sampleStats = new SummaryStatistics(); 268 while ((str = inputStream.readLine()) != null) { 269 val = Double.valueOf(str).doubleValue(); 270 sampleStats.addValue(val); 271 } 272 inputStream.close(); 273 inputStream = null; 274 } 275 } 276 277 /** 278 * <code>DataAdapter</code> for data provided as array of doubles. 279 */ 280 private class ArrayDataAdapter extends DataAdapter { 281 282 /** Array of input data values */ 283 private double[] inputArray; 284 285 /** 286 * Construct an ArrayDataAdapter from a double[] array 287 * 288 * @param in double[] array holding the data 289 */ 290 public ArrayDataAdapter(double[] in){ 291 super(); 292 inputArray = in; 293 } 294 295 /** {@inheritDoc} */ 296 @Override 297 public void computeStats() throws IOException { 298 sampleStats = new SummaryStatistics(); 299 for (int i = 0; i < inputArray.length; i++) { 300 sampleStats.addValue(inputArray[i]); 301 } 302 } 303 304 /** {@inheritDoc} */ 305 @Override 306 public void computeBinStats() throws IOException { 307 for (int i = 0; i < inputArray.length; i++) { 308 SummaryStatistics stats = 309 binStats.get(findBin(inputArray[i])); 310 stats.addValue(inputArray[i]); 311 } 312 } 313 } 314 315 /** 316 * Fills binStats array (second pass through data file). 317 * 318 * @param in object providing access to the data 319 * @throws IOException if an IO error occurs 320 */ 321 private void fillBinStats(Object in) throws IOException { 322 // Set up grid 323 min = sampleStats.getMin(); 324 max = sampleStats.getMax(); 325 delta = (max - min)/(Double.valueOf(binCount)).doubleValue(); 326 327 // Initialize binStats ArrayList 328 if (!binStats.isEmpty()) { 329 binStats.clear(); 330 } 331 for (int i = 0; i < binCount; i++) { 332 SummaryStatistics stats = new SummaryStatistics(); 333 binStats.add(i,stats); 334 } 335 336 // Filling data in binStats Array 337 DataAdapterFactory aFactory = new DataAdapterFactory(); 338 DataAdapter da = aFactory.getAdapter(in); 339 da.computeBinStats(); 340 341 // Assign upperBounds based on bin counts 342 upperBounds = new double[binCount]; 343 upperBounds[0] = 344 ((double) binStats.get(0).getN()) / (double) sampleStats.getN(); 345 for (int i = 1; i < binCount-1; i++) { 346 upperBounds[i] = upperBounds[i-1] + 347 ((double) binStats.get(i).getN()) / (double) sampleStats.getN(); 348 } 349 upperBounds[binCount-1] = 1.0d; 350 } 351 352 /** 353 * Returns the index of the bin to which the given value belongs 354 * 355 * @param value the value whose bin we are trying to find 356 * @return the index of the bin containing the value 357 */ 358 private int findBin(double value) { 359 return Math.min( 360 Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 361 binCount - 1); 362 } 363 364 /** 365 * Generates a random value from this distribution. 366 * 367 * @return the random value. 368 * @throws IllegalStateException if the distribution has not been loaded 369 */ 370 public double getNextValue() throws IllegalStateException { 371 372 if (!loaded) { 373 throw MathRuntimeException.createIllegalStateException("distribution not loaded"); 374 } 375 376 // Start with a uniformly distributed random number in (0,1) 377 double x = Math.random(); 378 379 // Use this to select the bin and generate a Gaussian within the bin 380 for (int i = 0; i < binCount; i++) { 381 if (x <= upperBounds[i]) { 382 SummaryStatistics stats = binStats.get(i); 383 if (stats.getN() > 0) { 384 if (stats.getStandardDeviation() > 0) { // more than one obs 385 return randomData.nextGaussian 386 (stats.getMean(),stats.getStandardDeviation()); 387 } else { 388 return stats.getMean(); // only one obs in bin 389 } 390 } 391 } 392 } 393 throw new MathRuntimeException("no bin selected"); 394 } 395 396 /** 397 * Returns a {@link StatisticalSummary} describing this distribution. 398 * <strong>Preconditions:</strong><ul> 399 * <li>the distribution must be loaded before invoking this method</li></ul> 400 * 401 * @return the sample statistics 402 * @throws IllegalStateException if the distribution has not been loaded 403 */ 404 public StatisticalSummary getSampleStats() { 405 return sampleStats; 406 } 407 408 /** 409 * Returns the number of bins. 410 * 411 * @return the number of bins. 412 */ 413 public int getBinCount() { 414 return binCount; 415 } 416 417 /** 418 * Returns a List of {@link SummaryStatistics} instances containing 419 * statistics describing the values in each of the bins. The list is 420 * indexed on the bin number. 421 * 422 * @return List of bin statistics. 423 */ 424 public List<SummaryStatistics> getBinStats() { 425 return binStats; 426 } 427 428 /** 429 * <p>Returns a fresh copy of the array of upper bounds for the bins. 430 * Bins are: <br/> 431 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],..., 432 * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p> 433 * 434 * <p>Note: In versions 1.0-2.0 of commons-math, this method 435 * incorrectly returned the array of probability generator upper 436 * bounds now returned by {@link #getGeneratorUpperBounds()}.</p> 437 * 438 * @return array of bin upper bounds 439 * @since 2.1 440 */ 441 public double[] getUpperBounds() { 442 double[] binUpperBounds = new double[binCount]; 443 binUpperBounds[0] = min + delta; 444 for (int i = 1; i < binCount - 1; i++) { 445 binUpperBounds[i] = binUpperBounds[i-1] + delta; 446 } 447 binUpperBounds[binCount - 1] = max; 448 return binUpperBounds; 449 } 450 451 /** 452 * <p>Returns a fresh copy of the array of upper bounds of the subintervals 453 * of [0,1] used in generating data from the empirical distribution. 454 * Subintervals correspond to bins with lengths proportional to bin counts.</p> 455 * 456 * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned 457 * by {@link #getUpperBounds()}.</p> 458 * 459 * @since 2.1 460 * @return array of upper bounds of subintervals used in data generation 461 */ 462 public double[] getGeneratorUpperBounds() { 463 int len = upperBounds.length; 464 double[] out = new double[len]; 465 System.arraycopy(upperBounds, 0, out, 0, len); 466 return out; 467 } 468 469 /** 470 * Property indicating whether or not the distribution has been loaded. 471 * 472 * @return true if the distribution has been loaded 473 */ 474 public boolean isLoaded() { 475 return loaded; 476 } 477 }