fn test_inner_join_cardinality_single_column()

in datafusion/physical-plan/src/joins/utils.rs [1722:1919]


    fn test_inner_join_cardinality_single_column() -> Result<()> {
        // Table-driven test: every entry pairs the statistics of one left column
        // with the statistics of one right column, plus the inner-join
        // cardinality expected for that pair.
        //
        // Layout of each entry:
        //   left:     (rows, min, max, distinct, null_count)
        //   right:    (rows, min, max, distinct, null_count)
        //   expected: the estimate, or `None` when no estimate is expected
        let cases: Vec<(PartialStats, PartialStats, Option<Precision<usize>>)> = vec![
            // ---------------------------------------------------------------
            // Cardinality computation
            // ---------------------------------------------------------------
            //
            // Distinct count absent on both sides, identical ranges.
            (
                (10, Inexact(1), Inexact(10), Absent, Absent),
                (10, Inexact(1), Inexact(10), Absent, Absent),
                Some(Inexact(10)),
            ),
            // Left range is wider than the right range.
            (
                (10, Inexact(6), Inexact(10), Absent, Absent),
                (10, Inexact(8), Inexact(10), Absent, Absent),
                Some(Inexact(20)),
            ),
            // Right range is wider than the left range.
            (
                (10, Inexact(8), Inexact(10), Absent, Absent),
                (10, Inexact(6), Inexact(10), Absent, Absent),
                Some(Inexact(20)),
            ),
            // On both sides the range is wider than the row count.
            (
                (10, Inexact(1), Inexact(15), Absent, Absent),
                (20, Inexact(1), Inexact(40), Absent, Absent),
                Some(Inexact(10)),
            ),
            // Distinct counts are available on both sides.
            (
                (10, Inexact(1), Inexact(10), Inexact(10), Absent),
                (10, Inexact(1), Inexact(10), Inexact(10), Absent),
                Some(Inexact(10)),
            ),
            // Left distinct count is larger than the right one.
            (
                (10, Inexact(1), Inexact(10), Inexact(5), Absent),
                (10, Inexact(1), Inexact(10), Inexact(2), Absent),
                Some(Inexact(20)),
            ),
            // Right distinct count is larger than the left one.
            (
                (10, Inexact(1), Inexact(10), Inexact(2), Absent),
                (10, Inexact(1), Inexact(10), Inexact(5), Absent),
                Some(Inexact(20)),
            ),
            // Negative left minimum (left range wider than right range).
            (
                (10, Inexact(-5), Inexact(5), Absent, Absent),
                (10, Inexact(1), Inexact(5), Absent, Absent),
                Some(Inexact(10)),
            ),
            // Minimum and maximum both negative (right range wider than left
            // range).
            (
                (10, Inexact(-25), Inexact(-20), Absent, Absent),
                (10, Inexact(-25), Inexact(-15), Absent, Absent),
                Some(Inexact(10)),
            ),
            // Left covers [-10, 0] without a distinct count; right covers
            // [0, 10] with one.
            // (There isn't a case where both left and right ranges are
            //  negative, so one of them is always going to work; this just
            //  proves negative ranges with bigger absolute values are not
            //  accidentally used.)
            (
                (10, Inexact(-10), Inexact(0), Absent, Absent),
                (10, Inexact(0), Inexact(10), Inexact(5), Absent),
                Some(Inexact(10)),
            ),
            // Both sides hold a single value (min == max).
            (
                (10, Inexact(1), Inexact(1), Absent, Absent),
                (10, Inexact(1), Inexact(1), Absent, Absent),
                Some(Inexact(100)),
            ),
            // ---------------------------------------------------------------
            // Edge cases
            // ---------------------------------------------------------------
            //
            // No column-level statistics at all.
            (
                (10, Absent, Absent, Absent, Absent),
                (10, Absent, Absent, Absent, Absent),
                None,
            ),
            // Min, max, or both are missing.
            (
                (10, Absent, Absent, Inexact(3), Absent),
                (10, Absent, Absent, Inexact(3), Absent),
                None,
            ),
            (
                (10, Inexact(2), Absent, Inexact(3), Absent),
                (10, Absent, Inexact(5), Inexact(3), Absent),
                None,
            ),
            (
                (10, Absent, Inexact(3), Inexact(3), Absent),
                (10, Inexact(1), Absent, Inexact(3), Absent),
                None,
            ),
            (
                (10, Absent, Inexact(3), Absent, Absent),
                (10, Inexact(1), Absent, Absent, Absent),
                None,
            ),
            // Min/max bounds that do not overlap (bounds are inexact).
            (
                (10, Absent, Inexact(4), Absent, Absent),
                (10, Inexact(5), Absent, Absent, Absent),
                Some(Inexact(0)),
            ),
            (
                (10, Inexact(0), Inexact(10), Absent, Absent),
                (10, Inexact(11), Inexact(20), Absent, Absent),
                Some(Inexact(0)),
            ),
            (
                (10, Inexact(11), Inexact(20), Absent, Absent),
                (10, Inexact(0), Inexact(10), Absent, Absent),
                Some(Inexact(0)),
            ),
            // Distinct count of zero on both sides.
            (
                (10, Inexact(1), Inexact(10), Inexact(0), Absent),
                (10, Inexact(1), Inexact(10), Inexact(0), Absent),
                None,
            ),
            // Inexact row count smaller than the exact null count, with no
            // distinct count available.
            (
                (0, Inexact(1), Inexact(10), Absent, Exact(5)),
                (10, Inexact(1), Inexact(10), Absent, Absent),
                Some(Inexact(0)),
            ),
        ];

        for (left, right, expected) in cases {
            // Unpack each side and build its single-column statistics.
            let (left_rows, left_min, left_max, left_distinct, left_nulls) = left;
            let left_columns = vec![create_column_stats(
                left_min,
                left_max,
                left_distinct,
                left_nulls,
            )];

            let (right_rows, right_min, right_max, right_distinct, right_nulls) = right;
            let right_columns = vec![create_column_stats(
                right_min,
                right_max,
                right_distinct,
                right_nulls,
            )];

            // First check the inner-join estimator directly, feeding it
            // hand-built statistics whose row counts are inexact.
            let left_input = Statistics {
                num_rows: Inexact(left_rows),
                total_byte_size: Absent,
                column_statistics: left_columns.clone(),
            };
            let right_input = Statistics {
                num_rows: Inexact(right_rows),
                total_byte_size: Absent,
                column_statistics: right_columns.clone(),
            };
            assert_eq!(
                estimate_inner_join_cardinality(left_input, right_input),
                expected.clone()
            );

            // The generic entry point must agree when asked for an inner join
            // on the first column of each side.
            let join_on = vec![(
                Arc::new(Column::new("a", 0)) as _,
                Arc::new(Column::new("b", 0)) as _,
            )];
            let estimate = estimate_join_cardinality(
                &JoinType::Inner,
                create_stats(Some(left_rows), left_columns.clone(), false),
                create_stats(Some(right_rows), right_columns.clone(), false),
                &join_on,
            );

            // Same row count as the direct estimate...
            assert_eq!(
                estimate.as_ref().map(|stats| Inexact(stats.num_rows)),
                expected.clone()
            );

            // ...and, whenever an estimate exists, the column statistics are
            // the left columns followed by the right columns.
            let expected_columns = expected.map(|_| {
                left_columns
                    .into_iter()
                    .chain(right_columns)
                    .collect::<Vec<_>>()
            });
            assert_eq!(
                estimate.map(|stats| stats.column_statistics),
                expected_columns
            );
        }
        Ok(())
    }