in datafusion/core/src/physical_plan/joins/utils.rs [1555:1724]
/// Table-driven test for inner-join cardinality estimation on a single join column.
///
/// Every case holds `(rows, min, max, distinct)` for the left and the right input,
/// followed by the expected estimate. Each case is checked twice: once by calling
/// `estimate_inner_join_cardinality` directly, and once through
/// `estimate_join_cardinality` with `JoinType::Inner`, which must report the same
/// row count and the left column statistics followed by the right ones.
fn test_inner_join_cardinality_single_column() -> Result<()> {
    let cases: Vec<(PartialStats, PartialStats, Option<usize>)> = vec![
        // ---------------------------------------------------------------------------
        // left (rows, min, max, distinct) | right (rows, min, max, distinct) | expected
        // ---------------------------------------------------------------------------
        //
        // Cardinality computation
        // =======================
        //
        // Neither side carries a distinct count.
        (
            (10, Some(1), Some(10), None),
            (10, Some(1), Some(10), None),
            Some(10),
        ),
        // range(left) > range(right)
        (
            (10, Some(6), Some(10), None),
            (10, Some(8), Some(10), None),
            Some(20),
        ),
        // range(right) > range(left)
        (
            (10, Some(8), Some(10), None),
            (10, Some(6), Some(10), None),
            Some(20),
        ),
        // range(left) > len(left), range(right) > len(right)
        (
            (10, Some(1), Some(15), None),
            (20, Some(1), Some(40), None),
            Some(10),
        ),
        // Both sides carry a distinct count.
        (
            (10, Some(1), Some(10), Some(10)),
            (10, Some(1), Some(10), Some(10)),
            Some(10),
        ),
        // distinct(left) > distinct(right)
        (
            (10, Some(1), Some(10), Some(5)),
            (10, Some(1), Some(10), Some(2)),
            Some(20),
        ),
        // distinct(right) > distinct(left)
        (
            (10, Some(1), Some(10), Some(2)),
            (10, Some(1), Some(10), Some(5)),
            Some(20),
        ),
        // min(left) < 0 (range(left) > range(right))
        (
            (10, Some(-5), Some(5), None),
            (10, Some(1), Some(5), None),
            Some(10),
        ),
        // min(right) < 0, max(right) < 0 (range(right) > range(left))
        (
            (10, Some(-25), Some(-20), None),
            (10, Some(-25), Some(-15), None),
            Some(10),
        ),
        // range(left) < 0, range(right) >= 0
        // (There is no case where both the left and the right range are negative,
        // so one of them is always usable; this only shows that a negative range
        // with a bigger absolute value is not accidentally used.)
        (
            (10, Some(10), Some(0), None),
            (10, Some(0), Some(10), Some(5)),
            Some(20), // Would have been 10 had abs(range(left)) been used.
        ),
        // range(left) = 1, range(right) = 1
        (
            (10, Some(1), Some(1), None),
            (10, Some(1), Some(1), None),
            Some(100),
        ),
        //
        // Edge cases
        // ==========
        //
        // No column-level stats at all.
        ((10, None, None, None), (10, None, None, None), None),
        // Min, max, or both are missing.
        ((10, None, None, Some(3)), (10, None, None, Some(3)), None),
        (
            (10, Some(2), None, Some(3)),
            (10, None, Some(5), Some(3)),
            None,
        ),
        (
            (10, None, Some(3), Some(3)),
            (10, Some(1), None, Some(3)),
            None,
        ),
        ((10, None, Some(3), None), (10, Some(1), None, None), None),
        // Non-overlapping min/max (when exact=False).
        (
            (10, Some(0), Some(10), None),
            (10, Some(11), Some(20), None),
            None,
        ),
        (
            (10, Some(11), Some(20), None),
            (10, Some(0), Some(10), None),
            None,
        ),
        // Same heading; here one side has min > max.
        (
            (10, Some(5), Some(10), Some(10)),
            (10, Some(11), Some(3), Some(10)),
            None,
        ),
        (
            (10, Some(10), Some(5), Some(10)),
            (10, Some(3), Some(7), Some(10)),
            None,
        ),
        // distinct(left) = 0, distinct(right) = 0
        (
            (10, Some(1), Some(10), Some(0)),
            (10, Some(1), Some(10), Some(0)),
            None,
        ),
    ];

    for (left, right, expected) in cases {
        // Unpack each side into its row count and a one-element column-stats vector.
        let (left_rows, left_min, left_max, left_distinct) = left;
        let left_columns = vec![create_column_stats(left_min, left_max, left_distinct)];
        let (right_rows, right_min, right_max, right_distinct) = right;
        let right_columns =
            vec![create_column_stats(right_min, right_max, right_distinct)];

        // First check: the inner-join estimator called directly.
        let direct_estimate = estimate_inner_join_cardinality(
            left_rows,
            right_rows,
            left_columns.clone(),
            right_columns.clone(),
            false,
        );
        assert_eq!(direct_estimate, expected);

        // Second check: the generic entry point must agree for an inner join.
        let on = vec![(Column::new("a", 0), Column::new("b", 0))];
        let join_stats = estimate_join_cardinality(
            &JoinType::Inner,
            create_stats(Some(left_rows), Some(left_columns.clone()), false),
            create_stats(Some(right_rows), Some(right_columns.clone()), false),
            &on,
        );
        assert_eq!(join_stats.as_ref().map(|stats| stats.num_rows), expected);

        // Whenever an estimate is produced, the reported column statistics must be
        // the left columns followed by the right columns.
        assert_eq!(
            join_stats.map(|stats| stats.column_statistics),
            expected.map(|_| [left_columns, right_columns].concat())
        );
    }

    Ok(())
}