in datafusion/physical-plan/src/joins/utils.rs [1722:1919]
/// Table-driven check of the inner-join cardinality estimate for a join on a
/// single column.
///
/// Every case supplies the statistics of the left and right input and the
/// estimate that is expected. Each case is verified twice: once by calling
/// `estimate_inner_join_cardinality` directly, and once through
/// `estimate_join_cardinality` with `JoinType::Inner`, which must report the
/// same row count and pass the input column statistics through (left columns
/// followed by right columns).
fn test_inner_join_cardinality_single_column() -> Result<()> {
    let cases: Vec<(PartialStats, PartialStats, Option<Precision<usize>>)> = vec![
        // Layout of every case:
        //
        //   left  = (rows, min, max, distinct, null_count)
        //   right = (rows, min, max, distinct, null_count)
        //   expected estimate
        //
        // Cardinality computation
        // =======================
        //
        // distinct(left) == NaN, distinct(right) == NaN
        (
            (10, Inexact(1), Inexact(10), Absent, Absent),
            (10, Inexact(1), Inexact(10), Absent, Absent),
            Some(Inexact(10)),
        ),
        // range(left) > range(right)
        (
            (10, Inexact(6), Inexact(10), Absent, Absent),
            (10, Inexact(8), Inexact(10), Absent, Absent),
            Some(Inexact(20)),
        ),
        // range(right) > range(left)
        (
            (10, Inexact(8), Inexact(10), Absent, Absent),
            (10, Inexact(6), Inexact(10), Absent, Absent),
            Some(Inexact(20)),
        ),
        // range(left) > len(left), range(right) > len(right)
        (
            (10, Inexact(1), Inexact(15), Absent, Absent),
            (20, Inexact(1), Inexact(40), Absent, Absent),
            Some(Inexact(10)),
        ),
        // Distinct counts are available on both sides.
        (
            (10, Inexact(1), Inexact(10), Inexact(10), Absent),
            (10, Inexact(1), Inexact(10), Inexact(10), Absent),
            Some(Inexact(10)),
        ),
        // distinct(left) > distinct(right)
        (
            (10, Inexact(1), Inexact(10), Inexact(5), Absent),
            (10, Inexact(1), Inexact(10), Inexact(2), Absent),
            Some(Inexact(20)),
        ),
        // distinct(right) > distinct(left)
        (
            (10, Inexact(1), Inexact(10), Inexact(2), Absent),
            (10, Inexact(1), Inexact(10), Inexact(5), Absent),
            Some(Inexact(20)),
        ),
        // min(left) < 0 (range(left) > range(right))
        (
            (10, Inexact(-5), Inexact(5), Absent, Absent),
            (10, Inexact(1), Inexact(5), Absent, Absent),
            Some(Inexact(10)),
        ),
        // min(right) < 0, max(right) < 0 (range(right) > range(left))
        (
            (10, Inexact(-25), Inexact(-20), Absent, Absent),
            (10, Inexact(-25), Inexact(-15), Absent, Absent),
            Some(Inexact(10)),
        ),
        // range(left) < 0, range(right) >= 0
        // (there isn't a case where both left and right ranges are negative,
        // so one of them is always going to work; this just proves negative
        // ranges with bigger absolute values are not accidentally used).
        (
            (10, Inexact(-10), Inexact(0), Absent, Absent),
            (10, Inexact(0), Inexact(10), Inexact(5), Absent),
            Some(Inexact(10)),
        ),
        // range(left) = 1, range(right) = 1
        (
            (10, Inexact(1), Inexact(1), Absent, Absent),
            (10, Inexact(1), Inexact(1), Absent, Absent),
            Some(Inexact(100)),
        ),
        //
        // Edge cases
        // ==========
        //
        // No column-level stats at all.
        (
            (10, Absent, Absent, Absent, Absent),
            (10, Absent, Absent, Absent, Absent),
            None,
        ),
        // Min, max, or both are missing.
        (
            (10, Absent, Absent, Inexact(3), Absent),
            (10, Absent, Absent, Inexact(3), Absent),
            None,
        ),
        (
            (10, Inexact(2), Absent, Inexact(3), Absent),
            (10, Absent, Inexact(5), Inexact(3), Absent),
            None,
        ),
        (
            (10, Absent, Inexact(3), Inexact(3), Absent),
            (10, Inexact(1), Absent, Inexact(3), Absent),
            None,
        ),
        (
            (10, Absent, Inexact(3), Absent, Absent),
            (10, Inexact(1), Absent, Absent, Absent),
            None,
        ),
        // Non-overlapping min/max (when exact=False).
        (
            (10, Absent, Inexact(4), Absent, Absent),
            (10, Inexact(5), Absent, Absent, Absent),
            Some(Inexact(0)),
        ),
        (
            (10, Inexact(0), Inexact(10), Absent, Absent),
            (10, Inexact(11), Inexact(20), Absent, Absent),
            Some(Inexact(0)),
        ),
        (
            (10, Inexact(11), Inexact(20), Absent, Absent),
            (10, Inexact(0), Inexact(10), Absent, Absent),
            Some(Inexact(0)),
        ),
        // distinct(left) = 0, distinct(right) = 0
        (
            (10, Inexact(1), Inexact(10), Inexact(0), Absent),
            (10, Inexact(1), Inexact(10), Inexact(0), Absent),
            None,
        ),
        // Inexact row count < exact null count with absent distinct count
        (
            (0, Inexact(1), Inexact(10), Absent, Exact(5)),
            (10, Inexact(1), Inexact(10), Absent, Absent),
            Some(Inexact(0)),
        ),
    ];

    for (left, right, expected) in cases {
        // Unpack each side into its row count and its single column's stats.
        let (l_rows, l_min, l_max, l_distinct, l_nulls) = left;
        let left_cols = vec![create_column_stats(l_min, l_max, l_distinct, l_nulls)];

        let (r_rows, r_min, r_max, r_distinct, r_nulls) = right;
        let right_cols = vec![create_column_stats(r_min, r_max, r_distinct, r_nulls)];

        // First path: the inner-join estimator called directly.
        let left_stats = Statistics {
            num_rows: Inexact(l_rows),
            total_byte_size: Absent,
            column_statistics: left_cols.clone(),
        };
        let right_stats = Statistics {
            num_rows: Inexact(r_rows),
            total_byte_size: Absent,
            column_statistics: right_cols.clone(),
        };
        assert_eq!(
            estimate_inner_join_cardinality(left_stats, right_stats),
            expected.clone()
        );

        // Second path: the generic join estimator must agree for an inner join.
        let join_type = JoinType::Inner;
        let join_on = vec![(
            Arc::new(Column::new("a", 0)) as _,
            Arc::new(Column::new("b", 0)) as _,
        )];
        let partial_join_stats = estimate_join_cardinality(
            &join_type,
            create_stats(Some(l_rows), left_cols.clone(), false),
            create_stats(Some(r_rows), right_cols.clone(), false),
            &join_on,
        );

        // Same row-count estimate ...
        let estimated_rows = partial_join_stats.as_ref().map(|s| Inexact(s.num_rows));
        assert_eq!(estimated_rows, expected.clone());

        // ... and, whenever an estimate exists, the column statistics are the
        // left columns followed by the right columns.
        let estimated_cols = partial_join_stats.map(|s| s.column_statistics);
        let expected_cols =
            expected.map(|_| left_cols.into_iter().chain(right_cols).collect::<Vec<_>>());
        assert_eq!(estimated_cols, expected_cols);
    }

    Ok(())
}