cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc (1,086 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "SubstraitToVeloxPlanValidator.h" #include <google/protobuf/wrappers.pb.h> #include <re2/re2.h> #include <string> #include "TypeUtils.h" #include "utils/Common.h" #include "velox/core/ExpressionEvaluator.h" #include "velox/exec/Aggregate.h" #include "velox/expression/Expr.h" #include "velox/expression/SignatureBinder.h" namespace gluten { namespace { static const char* extractFileName(const char* file) { return strrchr(file, '/') ? strrchr(file, '/') + 1 : file; } #define LOG_VALIDATION_MSG_FROM_EXCEPTION(err) \ logValidateMsg(fmt::format( \ "Validation failed due to exception caught at file:{} line:{} function:{}, thrown from file:{} line:{} function:{}, reason:{}", \ extractFileName(__FILE__), \ __LINE__, \ __FUNCTION__, \ extractFileName(err.file()), \ err.line(), \ err.function(), \ err.message())) #define LOG_VALIDATION_MSG(reason) \ logValidateMsg(fmt::format( \ "Validation failed at file:{}, line:{}, function:{}, reason:{}", \ extractFileName(__FILE__), \ __LINE__, \ __FUNCTION__, \ reason)) static const std::unordered_set<std::string> kRegexFunctions = { "regexp_extract", "regexp_extract_all", "regexp_replace", "rlike"}; static const std::unordered_set<std::string> kBlackList = { "split_part", "factorial", "concat_ws", "from_json", "json_array_length", "repeat", "trunc", "sequence", "posexplode", "arrays_overlap", "approx_percentile", "get_array_struct_fields"}; } // namespace bool SubstraitToVeloxPlanValidator::validateInputTypes( const ::substrait::extensions::AdvancedExtension& extension, std::vector<TypePtr>& types) { // The input type is wrapped in enhancement. if (!extension.has_enhancement()) { LOG_VALIDATION_MSG("Input type is not wrapped in enhancement."); return false; } const auto& enhancement = extension.enhancement(); ::substrait::Type inputType; if (!enhancement.UnpackTo(&inputType)) { LOG_VALIDATION_MSG("Enhancement can't be unpacked to inputType."); return false; } if (!inputType.has_struct_()) { LOG_VALIDATION_MSG("Input type has no struct."); return false; } // Get the input types. const auto& sTypes = inputType.struct_().types(); for (const auto& sType : sTypes) { try { types.emplace_back(SubstraitParser::parseType(sType)); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } return true; } bool SubstraitToVeloxPlanValidator::validateRound( const ::substrait::Expression::ScalarFunction& scalarFunction, const RowTypePtr& inputType) { const auto& arguments = scalarFunction.arguments(); if (arguments.size() < 2) { return false; } if (!arguments[1].value().has_literal()) { LOG_VALIDATION_MSG("Round scale is expected."); return false; } // Velox has different result with Spark on negative scale. auto typeCase = arguments[1].value().literal().literal_type_case(); switch (typeCase) { case ::substrait::Expression_Literal::LiteralTypeCase::kI32: return (arguments[1].value().literal().i32() >= 0); case ::substrait::Expression_Literal::LiteralTypeCase::kI64: return (arguments[1].value().literal().i64() >= 0); default: LOG_VALIDATION_MSG("Round scale validation is not supported for type case " + std::to_string(typeCase)); return false; } } bool SubstraitToVeloxPlanValidator::validateExtractExpr(const std::vector<core::TypedExprPtr>& params) { if (params.size() != 2) { LOG_VALIDATION_MSG("Value expected in variant in ExtractExpr."); return false; } auto functionArg = std::dynamic_pointer_cast<const core::ConstantTypedExpr>(params[0]); if (functionArg) { // Get the function argument. const auto& variant = functionArg->value(); if (!variant.hasValue()) { LOG_VALIDATION_MSG("Value expected in variant in ExtractExpr."); return false; } // The first parameter specifies extracting from which field. const auto& from = variant.value<std::string>(); // Hour causes incorrect result. if (from == "HOUR") { LOG_VALIDATION_MSG("Extract from hour."); return false; } return true; } LOG_VALIDATION_MSG("Constant is expected to be the first parameter in extract."); return false; } bool SubstraitToVeloxPlanValidator::validateRegexExpr( const std::string& name, const ::substrait::Expression::ScalarFunction& scalarFunction) { if (scalarFunction.arguments().size() < 2) { LOG_VALIDATION_MSG("Wrong number of arguments for " + name); } const auto& patternArg = scalarFunction.arguments()[1].value(); if (!patternArg.has_literal() || !patternArg.literal().has_string()) { LOG_VALIDATION_MSG("Pattern is not string literal for " + name); return false; } const auto& pattern = patternArg.literal().string(); std::string error; if (!validatePattern(pattern, error)) { LOG_VALIDATION_MSG(name + " due to " + error); return false; } return true; } bool SubstraitToVeloxPlanValidator::validateScalarFunction( const ::substrait::Expression::ScalarFunction& scalarFunction, const RowTypePtr& inputType) { std::vector<core::TypedExprPtr> params; params.reserve(scalarFunction.arguments().size()); for (const auto& argument : scalarFunction.arguments()) { if (argument.has_value() && !validateExpression(argument.value(), inputType)) { return false; } params.emplace_back(exprConverter_->toVeloxExpr(argument.value(), inputType)); } const auto& function = SubstraitParser::findFunctionSpec(planConverter_.getFunctionMap(), scalarFunction.function_reference()); const auto& name = SubstraitParser::getNameBeforeDelimiter(function); std::vector<std::string> types = SubstraitParser::getSubFunctionTypes(function); if (name == "round") { return validateRound(scalarFunction, inputType); } else if (name == "extract") { return validateExtractExpr(params); } else if (name == "char_length") { VELOX_CHECK(types.size() == 1); if (types[0] == "vbin") { LOG_VALIDATION_MSG("Binary type is not supported in " + name); return false; } } else if (name == "map_from_arrays") { LOG_VALIDATION_MSG("map_from_arrays is not supported."); return false; } else if (name == "get_array_item") { LOG_VALIDATION_MSG("get_array_item is not supported."); return false; } else if (name == "concat") { for (const auto& type : types) { if (type.find("struct") != std::string::npos || type.find("map") != std::string::npos || type.find("list") != std::string::npos) { LOG_VALIDATION_MSG(type + " is not supported in concat."); return false; } } } else if (name == "murmur3hash") { for (const auto& type : types) { if (type.find("struct") != std::string::npos || type.find("map") != std::string::npos || type.find("list") != std::string::npos) { LOG_VALIDATION_MSG(type + " is not supported in murmur3hash."); return false; } } } // Validate regex functions. if (kRegexFunctions.find(name) != kRegexFunctions.end()) { return validateRegexExpr(name, scalarFunction); } if (kBlackList.find(name) != kBlackList.end()) { LOG_VALIDATION_MSG("Function is not supported: " + name); return false; } return true; } bool SubstraitToVeloxPlanValidator::validateLiteral( const ::substrait::Expression_Literal& literal, const RowTypePtr& inputType) { if (literal.has_list()) { if (literal.list().values_size() == 0) { LOG_VALIDATION_MSG("Literal is a list but has no value."); return false; } else { for (auto child : literal.list().values()) { if (!validateLiteral(child, inputType)) { // the error msg has been set, so do not need to set it again. return false; } } } } else if (literal.has_map()) { if (literal.map().key_values().empty()) { LOG_VALIDATION_MSG("Literal is a map but has no value."); return false; } else { for (auto child : literal.map().key_values()) { if (!validateLiteral(child.key(), inputType) || !validateLiteral(child.value(), inputType)) { // the error msg has been set, so do not need to set it again. return false; } } } } return true; } bool SubstraitToVeloxPlanValidator::validateCast( const ::substrait::Expression::Cast& castExpr, const RowTypePtr& inputType) { if (!validateExpression(castExpr.input(), inputType)) { return false; } const auto& toType = SubstraitParser::parseType(castExpr.type()); if (toType->kind() == TypeKind::TIMESTAMP) { LOG_VALIDATION_MSG("Casting to TIMESTAMP is not supported."); return false; } core::TypedExprPtr input = exprConverter_->toVeloxExpr(castExpr.input(), inputType); // Casting from some types is not supported. See CastExpr::applyCast. if (input->type()->isDate()) { if (toType->kind() == TypeKind::TIMESTAMP) { LOG_VALIDATION_MSG("Casting from DATE to TIMESTAMP is not supported."); return false; } if (toType->kind() != TypeKind::VARCHAR) { LOG_VALIDATION_MSG("Casting from DATE to " + toType->toString() + " is not supported."); return false; } } switch (input->type()->kind()) { case TypeKind::ARRAY: case TypeKind::MAP: case TypeKind::ROW: case TypeKind::VARBINARY: LOG_VALIDATION_MSG("Invalid input type in casting: ARRAY/MAP/ROW/VARBINARY."); return false; case TypeKind::TIMESTAMP: { LOG_VALIDATION_MSG("Casting from TIMESTAMP is not supported or has incorrect result."); return false; } default: { } } return true; } bool SubstraitToVeloxPlanValidator::validateIfThen( const ::substrait::Expression_IfThen& ifThen, const RowTypePtr& inputType) { for (const auto& subIfThen : ifThen.ifs()) { return validateExpression(subIfThen.if_(), inputType) && validateExpression(subIfThen.then(), inputType); } return true; } bool SubstraitToVeloxPlanValidator::validateSingularOrList( const ::substrait::Expression::SingularOrList& singularOrList, const RowTypePtr& inputType) { for (const auto& option : singularOrList.options()) { if (!option.has_literal()) { LOG_VALIDATION_MSG("Option is expected as Literal."); return false; } if (!validateLiteral(option.literal(), inputType)) { return false; } } return validateExpression(singularOrList.value(), inputType); } bool SubstraitToVeloxPlanValidator::validateExpression( const ::substrait::Expression& expression, const RowTypePtr& inputType) { auto typeCase = expression.rex_type_case(); switch (typeCase) { case ::substrait::Expression::RexTypeCase::kScalarFunction: return validateScalarFunction(expression.scalar_function(), inputType); case ::substrait::Expression::RexTypeCase::kLiteral: return validateLiteral(expression.literal(), inputType); case ::substrait::Expression::RexTypeCase::kCast: return validateCast(expression.cast(), inputType); case ::substrait::Expression::RexTypeCase::kIfThen: return validateIfThen(expression.if_then(), inputType); case ::substrait::Expression::RexTypeCase::kSingularOrList: return validateSingularOrList(expression.singular_or_list(), inputType); default: return true; } } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::WriteRel& writeRel) { if (writeRel.has_input() && !validate(writeRel.input())) { LOG_VALIDATION_MSG("Validation failed for input type validation in WriteRel."); return false; } // Validate input data type. std::vector<TypePtr> types; if (writeRel.has_named_table()) { const auto& extension = writeRel.named_table().advanced_extension(); if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input type validation in WriteRel."); return false; } } // Validate partition key type. if (writeRel.has_table_schema()) { const auto& tableSchema = writeRel.table_schema(); auto isPartitionColumns = SubstraitParser::parsePartitionColumns(tableSchema); for (auto i = 0; i < types.size(); i++) { if (isPartitionColumns[i]) { switch (types[i]->kind()) { case TypeKind::BOOLEAN: case TypeKind::TINYINT: case TypeKind::SMALLINT: case TypeKind::INTEGER: case TypeKind::BIGINT: case TypeKind::VARCHAR: case TypeKind::VARBINARY: break; default: LOG_VALIDATION_MSG( "Validation failed for input type validation in WriteRel, not support partition column type: " + mapTypeKindToName(types[i]->kind())); return false; } } } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::FetchRel& fetchRel) { RowTypePtr rowType = nullptr; // Get and validate the input types from extension. if (fetchRel.has_advanced_extension()) { const auto& extension = fetchRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Unsupported input types in ExpandRel."); return false; } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(types.size()); for (auto colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } rowType = std::make_shared<RowType>(std::move(names), std::move(types)); } if (fetchRel.offset() < 0 || fetchRel.count() < 0) { LOG_VALIDATION_MSG("Offset and count should be valid in FetchRel."); return false; } // Check the input of fetchRel, if it's sortRel, we need to check whether the sorting key is duplicated. bool topNFlag = false; if (fetchRel.has_input()) { topNFlag = fetchRel.input().has_sort(); if (topNFlag) { ::substrait::SortRel sortRel = fetchRel.input().sort(); auto [sortingKeys, sortingOrders] = planConverter_.processSortField(sortRel.sorts(), rowType); folly::F14FastSet<std::string> sortingKeyNames; for (const auto& sortingKey : sortingKeys) { auto result = sortingKeyNames.insert(sortingKey->name()); if (!result.second) { LOG_VALIDATION_MSG( "If the input of fetchRel is a SortRel, we will convert it to a TopNNode. In Velox, it is important to ensure unique sorting keys. However, duplicate keys were found in this case."); return false; } } } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::GenerateRel& generateRel) { if (generateRel.has_input() && !validate(generateRel.input())) { LOG_VALIDATION_MSG("Input validation fails in GenerateRel."); return false; } // Get and validate the input types from extension. if (!generateRel.has_advanced_extension()) { LOG_VALIDATION_MSG("Input types are expected in GenerateRel."); return false; } const auto& extension = generateRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in GenerateRel."); return false; } int32_t inputPlanNodeId = 0; // Create the fake input names to be used in row type. std::vector<std::string> names; names.reserve(types.size()); for (uint32_t colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); if (generateRel.has_generator() && !validateExpression(generateRel.generator(), rowType)) { LOG_VALIDATION_MSG("Input validation fails in GenerateRel."); return false; } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ExpandRel& expandRel) { if (expandRel.has_input() && !validate(expandRel.input())) { LOG_VALIDATION_MSG("Input validation fails in ExpandRel."); return false; } RowTypePtr rowType = nullptr; // Get and validate the input types from extension. if (expandRel.has_advanced_extension()) { const auto& extension = expandRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Unsupported input types in ExpandRel."); return false; } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(types.size()); for (auto colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } rowType = std::make_shared<RowType>(std::move(names), std::move(types)); } int32_t projectSize = 0; // Validate fields. for (const auto& fields : expandRel.fields()) { std::vector<core::TypedExprPtr> expressions; if (fields.has_switching_field()) { auto projectExprs = fields.switching_field().duplicates(); expressions.reserve(projectExprs.size()); if (projectSize == 0) { projectSize = projectExprs.size(); } else if (projectSize != projectExprs.size()) { LOG_VALIDATION_MSG("SwitchingField expressions size should be constant in ExpandRel."); return false; } try { for (const auto& projectExpr : projectExprs) { const auto& typeCase = projectExpr.rex_type_case(); switch (typeCase) { case ::substrait::Expression::RexTypeCase::kSelection: case ::substrait::Expression::RexTypeCase::kLiteral: break; default: LOG_VALIDATION_MSG("Only field or literal is supported in project of ExpandRel."); return false; } if (rowType) { expressions.emplace_back(exprConverter_->toVeloxExpr(projectExpr, rowType)); } } if (rowType) { // Try to compile the expressions. If there is any unregistered // function or mismatched type, exception will be thrown. exec::ExprSet exprSet(std::move(expressions), execCtx_); } } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } else { LOG_VALIDATION_MSG("Only SwitchingField is supported in ExpandRel."); return false; } } return true; } bool validateBoundType(::substrait::Expression_WindowFunction_Bound boundType) { switch (boundType.kind_case()) { case ::substrait::Expression_WindowFunction_Bound::kUnboundedFollowing: case ::substrait::Expression_WindowFunction_Bound::kUnboundedPreceding: case ::substrait::Expression_WindowFunction_Bound::kCurrentRow: case ::substrait::Expression_WindowFunction_Bound::kFollowing: case ::substrait::Expression_WindowFunction_Bound::kPreceding: break; default: return false; } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::WindowRel& windowRel) { if (windowRel.has_input() && !validate(windowRel.input())) { LOG_VALIDATION_MSG("WindowRel input fails to validate."); return false; } // Get and validate the input types from extension. if (!windowRel.has_advanced_extension()) { LOG_VALIDATION_MSG("Input types are expected in WindowRel."); return false; } const auto& extension = windowRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in WindowRel."); return false; } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(types.size()); for (auto colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); // Validate WindowFunction std::vector<std::string> funcSpecs; funcSpecs.reserve(windowRel.measures().size()); for (const auto& smea : windowRel.measures()) { try { const auto& windowFunction = smea.measure(); funcSpecs.emplace_back(planConverter_.findFuncSpec(windowFunction.function_reference())); SubstraitParser::parseType(windowFunction.output_type()); for (const auto& arg : windowFunction.arguments()) { auto typeCase = arg.value().rex_type_case(); switch (typeCase) { case ::substrait::Expression::RexTypeCase::kSelection: case ::substrait::Expression::RexTypeCase::kLiteral: break; default: LOG_VALIDATION_MSG("Only field is supported in window functions."); return false; } } // Validate BoundType and Frame Type switch (windowFunction.window_type()) { case ::substrait::WindowType::ROWS: case ::substrait::WindowType::RANGE: break; default: LOG_VALIDATION_MSG( "the window type only support ROWS and RANGE, and the input type is " + std::to_string(windowFunction.window_type())); return false; } bool boundTypeSupported = validateBoundType(windowFunction.upper_bound()) && validateBoundType(windowFunction.lower_bound()); if (!boundTypeSupported) { LOG_VALIDATION_MSG( "Found unsupported Bound Type: upper " + std::to_string(windowFunction.upper_bound().kind_case()) + ", lower " + std::to_string(windowFunction.lower_bound().kind_case())); return false; } } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } // Validate supported aggregate functions. static const std::unordered_set<std::string> unsupportedFuncs = {"collect_list", "collect_set"}; for (const auto& funcSpec : funcSpecs) { auto funcName = SubstraitParser::getNameBeforeDelimiter(funcSpec); if (unsupportedFuncs.find(funcName) != unsupportedFuncs.end()) { LOG_VALIDATION_MSG(funcName + " was not supported in WindowRel."); return false; } } // Validate groupby expression const auto& groupByExprs = windowRel.partition_expressions(); std::vector<core::TypedExprPtr> expressions; expressions.reserve(groupByExprs.size()); try { for (const auto& expr : groupByExprs) { auto expression = exprConverter_->toVeloxExpr(expr, rowType); auto expr_field = dynamic_cast<const core::FieldAccessTypedExpr*>(expression.get()); if (expr_field == nullptr) { LOG_VALIDATION_MSG("Only field is supported for partition key in Window Operator!"); return false; } else { expressions.emplace_back(expression); } } // Try to compile the expressions. If there is any unregistred funciton or // mismatched type, exception will be thrown. exec::ExprSet exprSet(std::move(expressions), execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } // Validate Sort expression const auto& sorts = windowRel.sorts(); for (const auto& sort : sorts) { switch (sort.direction()) { case ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_FIRST: case ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_LAST: case ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_FIRST: case ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_LAST: break; default: LOG_VALIDATION_MSG("in windowRel, unsupported Sort direction " + std::to_string(sort.direction())); return false; } if (sort.has_expr()) { try { auto expression = exprConverter_->toVeloxExpr(sort.expr(), rowType); auto expr_field = dynamic_cast<const core::FieldAccessTypedExpr*>(expression.get()); if (!expr_field) { LOG_VALIDATION_MSG("in windowRel, the sorting key in Sort Operator only support field."); return false; } exec::ExprSet exprSet({std::move(expression)}, execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::SortRel& sortRel) { if (sortRel.has_input() && !validate(sortRel.input())) { return false; } // Get and validate the input types from extension. if (!sortRel.has_advanced_extension()) { LOG_VALIDATION_MSG("Input types are expected in SortRel."); return false; } const auto& extension = sortRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in SortRel."); return false; } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(types.size()); for (auto colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); const auto& sorts = sortRel.sorts(); for (const auto& sort : sorts) { switch (sort.direction()) { case ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_FIRST: case ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_LAST: case ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_FIRST: case ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_LAST: break; default: LOG_VALIDATION_MSG("unsupported Sort direction " + std::to_string(sort.direction())); return false; } if (sort.has_expr()) { try { auto expression = exprConverter_->toVeloxExpr(sort.expr(), rowType); auto expr_field = dynamic_cast<const core::FieldAccessTypedExpr*>(expression.get()); if (!expr_field) { LOG_VALIDATION_MSG("in SortRel, the sorting key in Sort Operator only support field."); return false; } exec::ExprSet exprSet({std::move(expression)}, execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ProjectRel& projectRel) { if (projectRel.has_input() && !validate(projectRel.input())) { LOG_VALIDATION_MSG("ProjectRel input"); return false; } // Get and validate the input types from extension. if (!projectRel.has_advanced_extension()) { LOG_VALIDATION_MSG("Input types are expected in ProjectRel."); return false; } const auto& extension = projectRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in ProjectRel."); return false; } int32_t inputPlanNodeId = 0; // Create the fake input names to be used in row type. std::vector<std::string> names; names.reserve(types.size()); for (uint32_t colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); // Validate the project expressions. const auto& projectExprs = projectRel.expressions(); std::vector<core::TypedExprPtr> expressions; expressions.reserve(projectExprs.size()); try { for (const auto& expr : projectExprs) { if (!validateExpression(expr, rowType)) { return false; } expressions.emplace_back(exprConverter_->toVeloxExpr(expr, rowType)); } // Try to compile the expressions. If there is any unregistered function or // mismatched type, exception will be thrown. exec::ExprSet exprSet(std::move(expressions), execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::FilterRel& filterRel) { if (filterRel.has_input() && !validate(filterRel.input())) { LOG_VALIDATION_MSG("input of FilterRel validation fails"); return false; } // Get and validate the input types from extension. if (!filterRel.has_advanced_extension()) { LOG_VALIDATION_MSG("Input types are expected in FilterRel."); return false; } const auto& extension = filterRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in FilterRel."); return false; } int32_t inputPlanNodeId = 0; // Create the fake input names to be used in row type. std::vector<std::string> names; names.reserve(types.size()); for (uint32_t colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); std::vector<core::TypedExprPtr> expressions; try { if (!validateExpression(filterRel.condition(), rowType)) { return false; } expressions.emplace_back(exprConverter_->toVeloxExpr(filterRel.condition(), rowType)); // Try to compile the expressions. If there is any unregistered function // or mismatched type, exception will be thrown. exec::ExprSet exprSet(std::move(expressions), execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::JoinRel& joinRel) { if (joinRel.has_left() && !validate(joinRel.left())) { LOG_VALIDATION_MSG("Validation fails for join left input."); return false; } if (joinRel.has_right() && !validate(joinRel.right())) { LOG_VALIDATION_MSG("Validation fails for join right input."); return false; } if (joinRel.has_advanced_extension() && SubstraitParser::configSetInOptimization(joinRel.advanced_extension(), "isSMJ=")) { switch (joinRel.type()) { case ::substrait::JoinRel_JoinType_JOIN_TYPE_INNER: case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT: break; default: LOG_VALIDATION_MSG("Sort merge join only support inner and left join."); return false; } } switch (joinRel.type()) { case ::substrait::JoinRel_JoinType_JOIN_TYPE_INNER: case ::substrait::JoinRel_JoinType_JOIN_TYPE_OUTER: case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT: case ::substrait::JoinRel_JoinType_JOIN_TYPE_RIGHT: case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT_SEMI: case ::substrait::JoinRel_JoinType_JOIN_TYPE_RIGHT_SEMI: case ::substrait::JoinRel_JoinType_JOIN_TYPE_ANTI: break; default: LOG_VALIDATION_MSG("Sort merge join only support inner and left join."); return false; } // Validate input types. if (!joinRel.has_advanced_extension()) { LOG_VALIDATION_MSG("Input types are expected in JoinRel."); return false; } const auto& extension = joinRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in JoinRel."); return false; } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(types.size()); for (auto colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); if (joinRel.has_expression()) { std::vector<const ::substrait::Expression::FieldReference*> leftExprs, rightExprs; try { planConverter_.extractJoinKeys(joinRel.expression(), leftExprs, rightExprs); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } if (joinRel.has_post_join_filter()) { try { auto expression = exprConverter_->toVeloxExpr(joinRel.post_join_filter(), rowType); exec::ExprSet exprSet({std::move(expression)}, execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::CrossRel& crossRel) { if (crossRel.has_left() && !validate(crossRel.left())) { logValidateMsg("native validation failed due to: validation fails for cross join left input. "); return false; } if (crossRel.has_right() && !validate(crossRel.right())) { logValidateMsg("native validation failed due to: validation fails for cross join right input. "); return false; } // Validate input types. if (!crossRel.has_advanced_extension()) { logValidateMsg("native validation failed due to: Input types are expected in CrossRel."); return false; } const auto& extension = crossRel.advanced_extension(); std::vector<TypePtr> types; if (!validateInputTypes(extension, types)) { logValidateMsg("native validation failed due to: Validation failed for input types in CrossRel"); return false; } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(types.size()); for (auto colIdx = 0; colIdx < types.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(types)); if (crossRel.has_expression()) { try { auto expression = exprConverter_->toVeloxExpr(crossRel.expression(), rowType); exec::ExprSet exprSet({std::move(expression)}, execCtx_); } catch (const VeloxException& err) { logValidateMsg("native validation failed due to: crossRel expression validation fails, " + err.message()); return false; } } return true; } bool SubstraitToVeloxPlanValidator::validateAggRelFunctionType(const ::substrait::AggregateRel& aggRel) { if (aggRel.measures_size() == 0) { return true; } for (const auto& smea : aggRel.measures()) { const auto& aggFunction = smea.measure(); const auto& funcStep = planConverter_.toAggregationFunctionStep(aggFunction); auto funcSpec = planConverter_.findFuncSpec(aggFunction.function_reference()); std::vector<TypePtr> types; bool isDecimal = false; try { types = SubstraitParser::sigToTypes(funcSpec); for (const auto& type : types) { if (!isDecimal && type->isDecimal()) { isDecimal = true; } } } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } auto baseFuncName = SubstraitParser::mapToVeloxFunction(SubstraitParser::getNameBeforeDelimiter(funcSpec), isDecimal); auto funcName = planConverter_.toAggregationFunctionName(baseFuncName, funcStep); auto signaturesOpt = exec::getAggregateFunctionSignatures(funcName); if (!signaturesOpt) { LOG_VALIDATION_MSG("can not find function signature for" + funcName + " in AggregateRel."); return false; } bool resolved = false; for (const auto& signature : signaturesOpt.value()) { exec::SignatureBinder binder(*signature, types); if (binder.tryBind()) { auto resolveType = binder.tryResolveType( exec::isPartialOutput(funcStep) ? signature->intermediateType() : signature->returnType()); if (resolveType == nullptr) { LOG_VALIDATION_MSG("Validation failed for function " + funcName + " resolve type in AggregateRel."); return false; } resolved = true; break; } } if (!resolved) { LOG_VALIDATION_MSG("Validation failed for function " + funcName + " bind signatures in AggregateRel."); return false; } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::AggregateRel& aggRel) { if (aggRel.has_input() && !validate(aggRel.input())) { LOG_VALIDATION_MSG("Input validation fails in AggregateRel."); return false; } // Validate input types. if (aggRel.has_advanced_extension()) { std::vector<TypePtr> types; const auto& extension = aggRel.advanced_extension(); // Aggregate always has advanced extension for streaming aggregate optimization, // but only some of them have enhancement for validation. if (extension.has_enhancement() && !validateInputTypes(extension, types)) { LOG_VALIDATION_MSG("Validation failed for input types in AggregateRel."); return false; } } // Validate groupings. for (const auto& grouping : aggRel.groupings()) { for (const auto& groupingExpr : grouping.grouping_expressions()) { const auto& typeCase = groupingExpr.rex_type_case(); switch (typeCase) { case ::substrait::Expression::RexTypeCase::kSelection: break; default: LOG_VALIDATION_MSG("Only field is supported in groupings."); return false; } } } // Validate aggregate functions. std::vector<std::string> funcSpecs; funcSpecs.reserve(aggRel.measures().size()); for (const auto& smea : aggRel.measures()) { try { // Validate the filter expression if (smea.has_filter()) { ::substrait::Expression aggRelMask = smea.filter(); if (aggRelMask.ByteSizeLong() > 0) { auto typeCase = aggRelMask.rex_type_case(); switch (typeCase) { case ::substrait::Expression::RexTypeCase::kSelection: break; default: LOG_VALIDATION_MSG("Only field is supported in aggregate filter expression."); return false; } } } const auto& aggFunction = smea.measure(); const auto& functionSpec = planConverter_.findFuncSpec(aggFunction.function_reference()); funcSpecs.emplace_back(functionSpec); SubstraitParser::parseType(aggFunction.output_type()); // Validate the size of arguments. if (SubstraitParser::getNameBeforeDelimiter(functionSpec) == "count" && aggFunction.arguments().size() > 1) { LOG_VALIDATION_MSG("Count should have only one argument."); // Count accepts only one argument. return false; } for (const auto& arg : aggFunction.arguments()) { auto typeCase = arg.value().rex_type_case(); switch (typeCase) { case ::substrait::Expression::RexTypeCase::kSelection: case ::substrait::Expression::RexTypeCase::kLiteral: break; default: LOG_VALIDATION_MSG("Only field is supported in aggregate functions."); return false; } } } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } // The supported aggregation functions. TODO: Remove this set when Presto aggregate functions in Velox are not // needed to be registered. static const std::unordered_set<std::string> supportedAggFuncs = { "sum", "collect_set", "count", "avg", "min", "max", "min_by", "max_by", "stddev_samp", "stddev_pop", "bloom_filter_agg", "var_samp", "var_pop", "bit_and", "bit_or", "bit_xor", "first", "first_ignore_null", "last", "last_ignore_null", "corr", "covar_pop", "covar_samp", "approx_distinct"}; for (const auto& funcSpec : funcSpecs) { auto funcName = SubstraitParser::getNameBeforeDelimiter(funcSpec); if (supportedAggFuncs.find(funcName) == supportedAggFuncs.end()) { LOG_VALIDATION_MSG(funcName + " was not supported in AggregateRel."); return false; } } if (!validateAggRelFunctionType(aggRel)) { return false; } // Validate both groupby and aggregates input are empty, which is corner case. if (aggRel.measures_size() == 0) { bool hasExpr = false; for (const auto& grouping : aggRel.groupings()) { if (grouping.grouping_expressions().size() > 0) { hasExpr = true; break; } } if (!hasExpr) { LOG_VALIDATION_MSG("Aggregation must specify either grouping keys or aggregates."); return false; } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ReadRel& readRel) { try { planConverter_.toVeloxPlan(readRel); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } // Validate filter in ReadRel. if (readRel.has_filter()) { std::vector<TypePtr> veloxTypeList; if (readRel.has_base_schema()) { const auto& baseSchema = readRel.base_schema(); veloxTypeList = SubstraitParser::parseNamedStruct(baseSchema); } int32_t inputPlanNodeId = 0; std::vector<std::string> names; names.reserve(veloxTypeList.size()); for (auto colIdx = 0; colIdx < veloxTypeList.size(); colIdx++) { names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx)); } auto rowType = std::make_shared<RowType>(std::move(names), std::move(veloxTypeList)); std::vector<core::TypedExprPtr> expressions; try { if (!validateExpression(readRel.filter(), rowType)) { return false; } expressions.emplace_back(exprConverter_->toVeloxExpr(readRel.filter(), rowType)); // Try to compile the expressions. If there is any unregistered function // or mismatched type, exception will be thrown. exec::ExprSet exprSet(std::move(expressions), execCtx_); } catch (const VeloxException& err) { LOG_VALIDATION_MSG_FROM_EXCEPTION(err); return false; } } return true; } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::Rel& rel) { if (rel.has_aggregate()) { return validate(rel.aggregate()); } else if (rel.has_project()) { return validate(rel.project()); } else if (rel.has_filter()) { return validate(rel.filter()); } else if (rel.has_join()) { return validate(rel.join()); } else if (rel.has_cross()) { return validate(rel.cross()); } else if (rel.has_read()) { return validate(rel.read()); } else if (rel.has_sort()) { return validate(rel.sort()); } else if (rel.has_expand()) { return validate(rel.expand()); } else if (rel.has_generate()) { return validate(rel.generate()); } else if (rel.has_fetch()) { return validate(rel.fetch()); } else if (rel.has_window()) { return validate(rel.window()); } else if (rel.has_write()) { return validate(rel.write()); } else { return false; } } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::RelRoot& relRoot) { if (relRoot.has_input()) { return validate(relRoot.input()); } else { return false; } } bool SubstraitToVeloxPlanValidator::validate(const ::substrait::Plan& plan) { // Create plan converter and expression converter to help the validation. planConverter_.constructFunctionMap(plan); exprConverter_ = planConverter_.getExprConverter(); for (const auto& rel : plan.relations()) { if (rel.has_root()) { return validate(rel.root()); } else if (rel.has_rel()) { return validate(rel.rel()); } } return false; } } // namespace gluten