private void analyzeAggregation()

in fe/fe-core/src/main/java/org/apache/doris/analysis/SelectStmt.java [1296:1691]
267 lines of code
80 McCabe index (conditional complexity)

    private void analyzeAggregation(Analyzer analyzer) throws AnalysisException {
        // check having clause
        if (havingClause != null) {
            Expr ambiguousAlias = getFirstAmbiguousAlias(havingClause);
            if (ambiguousAlias != null) {
                ErrorReport.reportAnalysisException(ErrorCode.ERR_NON_UNIQ_ERROR, ambiguousAlias.toColumnLabel());
            }
            /*
             * The having clause need to be substitute by aliasSMap.
             * And it is analyzed after substitute.
             * For example:
             * Query: select k1 a, sum(k2) b from table group by k1 having a > 1;
             * Having clause: a > 1
             * aliasSMap: <a, table.k1> <b, sum(table.k2)>
             * After substitute: a > 1 changed to table.k1 > 1
             * Analyzer: check column and other subquery in having clause
             * having predicate: table.k1 > 1
             */
            /*
             * TODO(ml): support substitute outer column in correlated subquery
             * For example: select k1 key, sum(k1) sum_k1 from table a group by k1
             *              having k1 >
             *                     (select min(k1) from table b where a.key=b.k2);
             * TODO: the a.key should be replaced by a.k1 instead of unknown column 'key' in 'a'
             */

            /* according to mysql (https://dev.mysql.com/doc/refman/8.0/en/select.html)
             * "For GROUP BY or HAVING clauses, it searches the FROM clause before searching in the
             * select_expr values. (For GROUP BY and HAVING, this differs from the pre-MySQL 5.0 behavior
             * that used the same rules as for ORDER BY.)"
             * case1: having clause use column name table.v1, because it searches the FROM clause firstly
             *     select id, sum(v1) v1 from table group by id,v1 having(v1>1);
             * case2: having clause used in aggregate functions, such as sum(v2) here
             *     select id, sum(v1) v1, sum(v2) v2 from table group by id,v1 having(v1>1 AND sum(v2)>1);
             * case3: having clause use alias name v, because table do not have column name v
             *     select id, floor(v1) v, sum(v2) v2 from table group by id,v having(v>1 AND v2>1);
             * case4: having clause use alias name vsum, because table do not have column name vsum
             *     select id, floor(v1) v, sum(v2) vsum from table group by id,v having(v>1 AND vsum>1);
             */
            if (groupByClause != null) {
                boolean aliasFirst = false;
                if (analyzer.getContext() != null) {
                    aliasFirst = analyzer.getContext().getSessionVariable().isGroupByAndHavingUseAliasFirst();
                }
                if (!aliasFirst) {
                    ExprSubstitutionMap excludeAliasSMap = aliasSMap.clone();
                    List<Expr> havingSlots = Lists.newArrayList();
                    havingClause.collect(SlotRef.class, havingSlots);
                    for (Expr expr : havingSlots) {
                        if (excludeAliasSMap.get(expr) == null) {
                            continue;
                        }
                        try {
                            // try to use column name firstly
                            expr.clone().analyze(analyzer);
                            // analyze success means column name exist, do not use alias name
                            excludeAliasSMap.removeByLhsExpr(expr);
                        } catch (AnalysisException ex) {
                            // according to case3, column name do not exist, keep alias name inside alias map
                            if (ConnectContext.get() != null) {
                                ConnectContext.get().getState().reset();
                            }
                        }
                    }
                    havingClauseAfterAnalyzed = havingClause.substitute(excludeAliasSMap, analyzer, false);
                } else {
                    // If user set force using alias, then having clauses prefer using alias rather than column name
                    havingClauseAfterAnalyzed = havingClause.substitute(aliasSMap, analyzer, false);
                }
            } else {
                // according to mysql
                // if there is no group by clause, the having clause should use alias
                havingClauseAfterAnalyzed = havingClause.substitute(aliasSMap, analyzer, false);
            }
            havingClauseAfterAnalyzed = rewriteQueryExprByMvColumnExpr(havingClauseAfterAnalyzed, analyzer);
            if (!havingClauseAfterAnalyzed.getType().isBoolean()) {
                havingClauseAfterAnalyzed = havingClauseAfterAnalyzed.castTo(Type.BOOLEAN);
            }
            havingClauseAfterAnalyzed.checkReturnsBool("HAVING clause", true);
            if (groupingInfo != null) {
                groupingInfo.substituteGroupingFn(Arrays.asList(havingClauseAfterAnalyzed), analyzer);
            }
            // can't contain analytic exprs
            Expr analyticExpr = havingClauseAfterAnalyzed.findFirstOf(AnalyticExpr.class);
            if (analyticExpr != null) {
                throw new AnalysisException(
                        "HAVING clause must not contain analytic expressions: "
                                + analyticExpr.toSql());
            }
            if (isContainInBitmap(havingClauseAfterAnalyzed)) {
                throw new AnalysisException(
                        "HAVING clause dose not support in bitmap syntax: " + havingClauseAfterAnalyzed.toSql());
            }
        }

        if (groupByClause == null && !selectList.isDistinct()
                && !TreeNode.contains(resultExprs, Expr.isAggregatePredicate())
                && (havingClauseAfterAnalyzed == null || !havingClauseAfterAnalyzed.contains(
                        Expr.isAggregatePredicate()))
                && (sortInfo == null || !TreeNode.contains(sortInfo.getOrderingExprs(),
                Expr.isAggregatePredicate()))) {
            // We're not computing aggregates but we still need to register the HAVING
            // clause which could, e.g., contain a constant expression evaluating to false.
            if (havingClauseAfterAnalyzed != null) {
                if (havingClauseAfterAnalyzed.contains(Subquery.class)) {
                    throw new AnalysisException("Only constant expr could be supported in having clause "
                            + "when no aggregation in stmt");
                }
                analyzer.registerConjuncts(havingClauseAfterAnalyzed, true);
            }
            return;
        }

        // If we're computing an aggregate, we must have a FROM clause.
        if (fromClause.size() == 0) {
            throw new AnalysisException("Aggregation without a FROM clause is not allowed");
        }

        if (selectList.isDistinct() && groupByClause == null) {
            List<Expr> aggregateExpr = Lists.newArrayList();
            TreeNode.collect(resultExprs, Expr.isAggregatePredicate(), aggregateExpr);
            if (aggregateExpr.size() == resultExprs.size()) {
                selectList.setIsDistinct(false);
            }
        }

        if (selectList.isDistinct()
                && (groupByClause != null
                || TreeNode.contains(resultExprs, Expr.isAggregatePredicate())
                || (havingClauseAfterAnalyzed != null && havingClauseAfterAnalyzed.contains(
                        Expr.isAggregatePredicate())))) {
            throw new AnalysisException("cannot combine SELECT DISTINCT with aggregate functions or GROUP BY");
        }

        // disallow '*' and explicit GROUP BY (we can't group by '*', and if you need to
        // name all star-expanded cols in the group by clause you might as well do it
        // in the select list)
        if (groupByClause != null || TreeNode.contains(resultExprs, Expr.isAggregatePredicate())) {
            for (SelectListItem item : selectList.getItems()) {
                if (item.isStar()) {
                    throw new AnalysisException(
                            "cannot combine '*' in select list with GROUP BY: " + item.toSql());
                }
            }
        }

        // can't contain analytic exprs
        ArrayList<Expr> aggExprsForChecking = Lists.newArrayList();
        TreeNode.collect(resultExprs, Expr.isAggregatePredicate(), aggExprsForChecking);
        ArrayList<Expr> analyticExprs = Lists.newArrayList();
        TreeNode.collect(aggExprsForChecking, AnalyticExpr.class, analyticExprs);
        if (!analyticExprs.isEmpty()) {
            throw new AnalysisException(
                "AGGREGATE clause must not contain analytic expressions");
        }

        // Collect the aggregate expressions from the SELECT, HAVING and ORDER BY clauses
        // of this statement.
        ArrayList<FunctionCallExpr> aggExprs = Lists.newArrayList();
        TreeNode.collect(resultExprs, Expr.isAggregatePredicate(), aggExprs);
        if (havingClauseAfterAnalyzed != null) {
            havingClauseAfterAnalyzed.collect(Expr.isAggregatePredicate(), aggExprs);
        }
        if (sortInfo != null) {
            // TODO: Avoid evaluating aggs in ignored order-bys
            TreeNode.collect(sortInfo.getOrderingExprs(), Expr.isAggregatePredicate(), aggExprs);
        }

        // When DISTINCT aggregates are present, non-distinct (i.e. ALL) aggregates are
        // evaluated in two phases (see AggregateInfo for more details). In particular,
        // COUNT(c) in "SELECT COUNT(c), AGG(DISTINCT d) from R" is transformed to
        // "SELECT SUM(cnt) FROM (SELECT COUNT(c) as cnt from R group by d ) S".
        // Since a group-by expression is added to the inner query it returns no rows if
        // R is empty, in which case the SUM of COUNTs will return NULL.
        // However the original COUNT(c) should have returned 0 instead of NULL in this case.
        // Therefore, COUNT([ALL]) is transformed into zeroifnull(COUNT([ALL]) if
        // i) There is no GROUP-BY clause, and
        // ii) Other DISTINCT aggregates are present.
        ExprSubstitutionMap countAllMap = createCountAllMap(aggExprs, analyzer);
        final ExprSubstitutionMap multiDistinctAggMap =
                createMultiDistinctAggSMap(aggExprs, analyzer);
        countAllMap = ExprSubstitutionMap.compose(multiDistinctAggMap, countAllMap, analyzer);
        List<Expr> substitutedAggs =
                Expr.substituteList(aggExprs, countAllMap, analyzer, false);
        // the resultExprs and havingClause must substitute in the same way as aggExprs
        // then resultExprs and havingClause can be substitute correctly using combinedSmap
        resultExprs = Expr.substituteList(resultExprs, countAllMap, analyzer, false);
        if (havingClauseAfterAnalyzed != null) {
            havingClauseAfterAnalyzed =
                    havingClauseAfterAnalyzed.substitute(countAllMap, analyzer, false);
        }
        if (sortInfo != null) {
            // the ordering exprs must substitute in the same way as resultExprs
            sortInfo.substituteOrderingExprs(countAllMap, analyzer);
        }
        aggExprs.clear();
        TreeNode.collect(substitutedAggs, Expr.isAggregatePredicate(), aggExprs);

        List<TupleId> groupingByTupleIds = new ArrayList<>();
        if (groupByClause != null) {
            groupByClause.genGroupingExprs();
            ArrayList<Expr> groupingExprs = groupByClause.getGroupingExprs();
            if (groupingInfo != null) {
                groupingInfo.buildRepeat(groupingExprs, groupByClause.getGroupingSetList());
            }

            boolean aliasFirst = false;
            if (analyzer.getContext() != null) {
                aliasFirst = analyzer.getContext().getSessionVariable().isGroupByAndHavingUseAliasFirst();
            }
            substituteOrdinalsAliases(groupingExprs, "GROUP BY", analyzer, aliasFirst);
            // the groupingExprs must substitute in the same way as resultExprs
            groupingExprs = Expr.substituteList(groupingExprs, countAllMap, analyzer, false);

            if (!groupByClause.isGroupByExtension() && !groupingExprs.isEmpty()) {
                /*
                For performance reason, we want to remove constant column from groupingExprs.
                For example:
                `select sum(T.A) from T group by T.B, 'xyz'` is equivalent to `select sum(T.A) from T group by T.B`
                We can remove constant column `abc` from groupingExprs.

                But there is an exception when all groupingExpr are constant
                For example:
                sql1: `select 'abc' from t group by 'abc'`
                 is not equivalent to
                sql2: `select 'abc' from t`

                sql3: `select 'abc', sum(a) from t group by 'abc'`
                 is not equivalent to
                sql4: `select 1, sum(a) from t`
                (when t is empty, sql3 returns 0 tuple, sql4 return 1 tuple)

                We need to keep some constant columns if all groupingExpr are constant.
                Consider sql5 `select a from (select "abc" as a, 'def' as b) T group by b, a;`
                if the constant column is in select list, this column should not be removed.
                 */

                Expr theFirstConstantGroupingExpr = null;
                boolean someGroupExprRemoved = false;
                ArrayList<Expr> tempExprs = new ArrayList<>();
                for (Expr groupExpr : groupingExprs) {
                    //remove groupExpr if it is const, and it is not in select list
                    boolean removeConstGroupingKey = false;
                    if (groupExpr.isConstant() && !(groupExpr.contains(e -> e instanceof SlotRef))) {
                        if (theFirstConstantGroupingExpr == null) {
                            theFirstConstantGroupingExpr = groupExpr;
                        }
                        boolean keyInSelectList = false;
                        if (groupExpr instanceof SlotRef) {
                            for (SelectListItem item : selectList.getItems()) {
                                if (item.getExpr() instanceof SlotRef) {
                                    keyInSelectList = ((SlotRef) item.getExpr()).columnEqual(groupExpr);
                                    if (keyInSelectList) {
                                        break;
                                    }
                                }
                            }
                        }
                        removeConstGroupingKey = ! keyInSelectList;
                    }
                    if (removeConstGroupingKey) {
                        someGroupExprRemoved = true;
                    } else {
                        tempExprs.add(groupExpr);
                    }
                }
                if (someGroupExprRemoved) {
                    groupingExprs.clear();
                    groupingExprs.addAll(tempExprs);
                    //groupingExprs need at least one expr, it can be
                    //any original grouping expr. we use the first one.
                    if (groupingExprs.isEmpty()) {
                        groupingExprs.add(theFirstConstantGroupingExpr);
                    }
                }
            }

            for (int i = 0; i < groupingExprs.size(); i++) {
                groupingExprs.set(i, rewriteQueryExprByMvColumnExpr(groupingExprs.get(i), analyzer));
            }

            if (groupingInfo != null) {
                groupingInfo.genOutputTupleDescAndSMap(analyzer, groupingExprs, aggExprs);
                // must do it before copying for createAggInfo()
                groupingByTupleIds.add(groupingInfo.getOutputTupleDesc().getId());
            }
            groupByClause.analyze(analyzer);
            createAggInfo(groupingExprs, aggExprs, analyzer);
        } else {
            createAggInfo(new ArrayList<>(), aggExprs, analyzer);
        }
        // we remove all constant in group by expressions, when all exprs are constant
        // and no aggregate expr in select list, we do not generate aggInfo at all.
        if (aggInfo == null) {
            return;
        }

        // combine avg smap with the one that produces the final agg output
        AggregateInfo finalAggInfo =
                aggInfo.getSecondPhaseDistinctAggInfo() != null
                        ? aggInfo.getSecondPhaseDistinctAggInfo()
                        : aggInfo;
        groupingByTupleIds.add(finalAggInfo.getOutputTupleId());
        ExprSubstitutionMap combinedSmap = ExprSubstitutionMap.compose(
                countAllMap, finalAggInfo.getOutputSmap(), analyzer);
        // change select list, having and ordering exprs to point to agg output. We need
        // to reanalyze the exprs at this point.
        if (LOG.isDebugEnabled()) {
            LOG.debug("combined smap: " + combinedSmap.debugString());
            LOG.debug("desctbl: " + analyzer.getDescTbl().debugString());
            LOG.debug("resultexprs: " + Expr.debugString(resultExprs));
        }

        if (havingClauseAfterAnalyzed != null) {
            // forbidden correlated subquery in having clause
            List<Subquery> subqueryInHaving = Lists.newArrayList();
            havingClauseAfterAnalyzed.collect(Subquery.class, subqueryInHaving);
            for (Subquery subquery : subqueryInHaving) {
                if (subquery.isCorrelatedPredicate(getTableRefIds())) {
                    throw new AnalysisException("The correlated having clause is not supported");
                }
            }
        }

        /*
         * All of columns of result and having clause are replaced by new slot ref
         * which is bound by top tuple of agg info.
         * For example:
         * ResultExprs: SlotRef(k1), FunctionCall(sum(SlotRef(k2)))
         * Having predicate: FunctionCall(sum(SlotRef(k2))) > subquery
         * CombinedSMap: <SlotRef(k1) tuple 0, SlotRef(k1) of tuple 3>,
         *               <FunctionCall(SlotRef(k2)) tuple 0, SlotRef(sum(k2)) of tuple 3>
         *
         * After rewritten:
         * ResultExprs: SlotRef(k1) of tuple 3, SlotRef(sum(k2)) of tuple 3
         * Having predicate: SlotRef(sum(k2)) of tuple 3 > subquery
         */
        resultExprs = Expr.substituteList(resultExprs, combinedSmap, analyzer, false);
        if (LOG.isDebugEnabled()) {
            LOG.debug("post-agg selectListExprs: " + Expr.debugString(resultExprs));
        }
        if (havingClauseAfterAnalyzed != null) {
            havingPred = havingClauseAfterAnalyzed.substitute(combinedSmap, analyzer, false);
            analyzer.registerConjuncts(havingPred, true, finalAggInfo.getOutputTupleId().asList());
            if (LOG.isDebugEnabled()) {
                LOG.debug("post-agg havingPred: " + havingPred.debugString());
            }
        }

        if (sortInfo != null) {
            sortInfo.substituteOrderingExprs(combinedSmap, analyzer);
            if (LOG.isDebugEnabled()) {
                LOG.debug("post-agg orderingExprs: "
                        + Expr.debugString(sortInfo.getOrderingExprs()));
            }
        }

        // check that all post-agg exprs point to agg output
        for (int i = 0; i < selectList.getItems().size(); ++i) {
            if (!resultExprs.get(i).isBoundByTupleIds(groupingByTupleIds)) {
                if (CreateMaterializedViewStmt.isMVColumn(resultExprs.get(i).toSqlWithoutTbl())) {
                    List<TupleId> tupleIds = Lists.newArrayList();
                    List<SlotId> slotIds = Lists.newArrayList();
                    resultExprs.get(i).getIds(tupleIds, slotIds);
                    for (TupleId id : tupleIds) {
                        updateDisableTuplesMVRewriter(id);
                    }
                    throw new MVSelectFailedException("Materialized View rewrite invalid");
                } else {
                    throw new AnalysisException(
                            "select list expression not produced by aggregation output " + "(missing from "
                                    + "GROUP BY clause?): " + selectList.getItems().get(i).getExpr().toSql());
                }
            }
        }
        if (orderByElements != null) {
            for (int i = 0; i < orderByElements.size(); ++i) {
                if (!sortInfo.getOrderingExprs().get(i).isBoundByTupleIds(groupingByTupleIds)) {
                    throw new AnalysisException(
                            "ORDER BY expression not produced by aggregation output " + "(missing from "
                                    + "GROUP BY clause?): " + orderByElements.get(i).getExpr().toSql());
                }

                if (sortInfo.getOrderingExprs().get(i).type.isObjectStored()) {
                    throw new AnalysisException("ORDER BY expression could not contain object-stored columnx.");
                }
            }
        }
        if (havingPred != null) {
            if (!havingPred.isBoundByTupleIds(groupingByTupleIds)) {
                throw new AnalysisException(
                        "HAVING clause not produced by aggregation output " + "(missing from GROUP BY "
                                + "clause?): " + havingClause.toSql());
            }
        }
    }