Thanks to @Emil_Wozniak for posting the complete code. I struggled with it for a while before realizing that eliminateOutliers()
actually returns the outliers, not the list with them eliminated. The isOutOfBounds()
method was also confusing because it actually returns TRUE when the value is IN bounds. Below is my update with some (IMHO) improvements:
- The eliminateOutliers() method returns the input list with outliers removed
- Added getOutliers() method to get just the list of outliers
- Removed confusing isOutOfBounds() method in favor of a simple filtering expression
- Expanded N list to support up to 30 input values
- Protect against out of bounds errors when input list is too big or too small
- Made stats methods (mean, stddev, variance) static utility methods
- Calculate upper/lower bounds only once instead of on every comparison
- Supply the input list to the constructor and store it as an instance variable
- Refactor to avoid using the same variable name as instance and local variables
Code:
/**
* Implements an outlier removal algorithm based on https://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/dixon.htm
* Original Java code by Emil Wozniak at https://stackoverflow.com/questions/18805178/how-to-detect-outliers-in-an-arraylist
*
* Reorganized, made more robust, and clarified many of the methods.
*/
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Partitions a list of values into in-bounds ("good") values and outliers.
 *
 * <p>The acceptance band is computed once, at construction time, as
 * {@code mean +/- stdDev * criticalValue}, where {@code mean} and {@code stdDev}
 * are taken over the complete input (outliers included) and {@code criticalValue}
 * is looked up by sample size in {@link #criticalValues}.
 *
 * <p>NOTE(review): the classical Dixon Q test compares a gap/range ratio against
 * these critical values; this class uses them as a multiplier on the standard
 * deviation instead. Confirm that this is the intended statistic.
 *
 * <p>Instances are immutable after construction: the input list is copied, so
 * later changes to the caller's list do not affect the results.
 */
public class DixonTest {

    /**
     * Critical values indexed by sample size: element 0 belongs to N=3 and the
     * last element to N=30. Sample sizes outside that range are clamped to the
     * nearest entry.
     */
    protected List<Double> criticalValues =
            List.of( // Taken from https://sebastianraschka.com/Articles/2014_dixon_test.html#2-calculate-q
                    // Alpha level of 0.1 (90% confidence)
                    0.941, // N=3
                    0.765, // N=4
                    0.642, // ...
                    0.56,
                    0.507,
                    0.468,
                    0.437,
                    0.412,
                    0.392,
                    0.376,
                    0.361,
                    0.349,
                    0.338,
                    0.329,
                    0.32,
                    0.313,
                    0.306,
                    0.3,
                    0.295,
                    0.29,
                    0.285,
                    0.281,
                    0.277,
                    0.273,
                    0.269,
                    0.266,
                    0.263,
                    0.26 // N=30
            );

    /** Private snapshot of the values supplied to the constructor. */
    private final List<Double> input;

    /** Smallest value still considered "good" (inclusive). */
    private final double lowerBound;

    /** Largest value still considered "good" (inclusive). */
    private final double upperBound;

    /**
     * Creates a detector for the given values and computes the acceptance band
     * from them.
     *
     * @param input the values to analyze; the list is copied, so later changes
     *              to it are not seen by this instance
     * @throws NullPointerException if {@code input} is {@code null}, or if it
     *                              holds two or more elements and one of them
     *                              is {@code null}
     */
    public DixonTest(List<Double> input) {
        // Copy so that the cached bounds can never go stale relative to the data.
        this.input = new ArrayList<>(input);

        if (this.input.size() < 2) {
            // The sample standard deviation is undefined (0/0) for fewer than two
            // values, which would turn both bounds into NaN and make every value
            // fail both filters. With no spread there is nothing to reject.
            lowerBound = Double.NEGATIVE_INFINITY;
            upperBound = Double.POSITIVE_INFINITY;
        } else {
            double mean = getMean(this.input);
            double halfWidth = getStdDev(this.input) * criticalValueFor(this.input.size());
            lowerBound = mean - halfWidth;
            upperBound = mean + halfWidth;
        }
    }

    /**
     * Looks up the critical value for a sample size, clamping sizes below 3 to
     * the first table entry and sizes above the table to the last one.
     *
     * @param sampleSize number of input values
     * @return the critical value at full double precision
     */
    private double criticalValueFor(int sampleSize) {
        int index = Math.min(Math.max(0, sampleSize - 3), criticalValues.size() - 1);
        return criticalValues.get(index);
    }

    /**
     * Returns the arithmetic mean of a list of values.
     *
     * @param valueList the values to average
     * @return the sum divided by the element count; {@code NaN} for an empty list
     */
    public static double getMean(final List<Double> valueList) {
        double sum = valueList.stream()
                .mapToDouble(Double::doubleValue)
                .sum();
        return sum / valueList.size();
    }

    /**
     * Returns the sample variance (divisor {@code n - 1}) of a list of values.
     *
     * @param valueList the values to analyze
     * @return the sum of squared deviations from the mean divided by
     *         {@code size - 1}; {@code NaN} for lists with fewer than two elements
     */
    public static double getVariance(List<Double> valueList) {
        double listMean = getMean(valueList);
        double squaredDeviations = valueList.stream()
                .mapToDouble(Double::doubleValue)
                .map(value -> (value - listMean) * (value - listMean))
                .sum();
        return squaredDeviations / (valueList.size() - 1);
    }

    /**
     * Returns the sample standard deviation of a list of values.
     *
     * @param valueList the values to analyze
     * @return the square root of {@link #getVariance(List)}
     */
    public static double getStdDev(List<Double> valueList) {
        return Math.sqrt(getVariance(valueList));
    }

    /**
     * Returns the input values with outliers removed.
     *
     * @return a new list holding, in input order, every value that lies within
     *         the inclusive acceptance band
     */
    public List<Double> eliminateOutliers() {
        return input.stream()
                .filter(value -> value >= lowerBound && value <= upperBound)
                .collect(Collectors.toList());
    }

    /**
     * Returns the outliers found in the input list.
     *
     * @return a new list holding, in input order, every value that lies strictly
     *         outside the acceptance band
     */
    public List<Double> getOutliers() {
        return input.stream()
                .filter(value -> value < lowerBound || value > upperBound)
                .collect(Collectors.toList());
    }

    /**
     * Test and sample usage.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Double> testValues = List.of(1200.0, 1205.0, 1220.0, 1194.0, 1212.0);
        DixonTest outlierDetector = new DixonTest(testValues);
        List<Double> goodValues = outlierDetector.eliminateOutliers();
        List<Double> badValues = outlierDetector.getOutliers();

        System.out.println(goodValues.size() + " good values:");
        for (double v : goodValues) {
            System.out.println(v);
        }

        System.out.println(badValues.size() + " outliers detected:");
        for (double v : badValues) {
            System.out.println(v);
        }

        // Get stats on remaining (good) values
        System.out.println("\nMean of good values is " + DixonTest.getMean(goodValues));
    }
}