in src/main/java/org/apache/datasketches/hive/tuple/ArrayOfDoublesSketchesTTestUDF.java [50:75]
public List<Double> evaluate(final BytesWritable serializedSketchA, final BytesWritable serializedSketchB) {
  // Contract:
  //  - returns null when either serialized sketch is null;
  //  - throws IllegalArgumentException when the two sketches do not report the
  //    same number of values;
  //  - returns null when either sketch retains fewer than two entries;
  //  - otherwise returns one t-test result per value index, in index order.
  if (serializedSketchA == null || serializedSketchB == null) {
    return null;
  }

  // Wrap the serialized bytes as sketches (A first, then B).
  final ArrayOfDoublesSketch first =
      ArrayOfDoublesSketches.wrapSketch(BytesWritableHelper.wrapAsMemory(serializedSketchA));
  final ArrayOfDoublesSketch second =
      ArrayOfDoublesSketches.wrapSketch(BytesWritableHelper.wrapAsMemory(serializedSketchB));

  // Both sketches must report the same number of values so that the
  // per-index statistics can be paired up below.
  final int numValues = first.getNumValues();
  if (numValues != second.getNumValues()) {
    throw new IllegalArgumentException("Both sketches must have the same number of values");
  }

  // The p-value can't be calculated unless each sketch retains at least two entries.
  final boolean enoughEntries = first.getRetainedEntries() >= 2 && second.getRetainedEntries() >= 2;
  if (!enoughEntries) {
    return null;
  }

  // One SummaryStatistics per value index for each sketch.
  final SummaryStatistics[] statsA = ArrayOfDoublesSketchStats.sketchToSummaryStatistics(first);
  final SummaryStatistics[] statsB = ArrayOfDoublesSketchStats.sketchToSummaryStatistics(second);

  final TTest test = new TTest();
  final List<Double> results = new ArrayList<>(numValues);
  for (int idx = 0; idx < numValues; idx++) {
    final double pValue = test.tTest(statsA[idx], statsB[idx]);
    results.add(pValue);
  }
  return results;
}