cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc (1,086 lines of code) (raw):
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "SubstraitToVeloxPlanValidator.h"
#include <google/protobuf/wrappers.pb.h>
#include <re2/re2.h>
#include <string>
#include "TypeUtils.h"
#include "utils/Common.h"
#include "velox/core/ExpressionEvaluator.h"
#include "velox/exec/Aggregate.h"
#include "velox/expression/Expr.h"
#include "velox/expression/SignatureBinder.h"
namespace gluten {
namespace {
// Returns the part of `file` that follows its last '/' character, or `file`
// itself when it contains no '/'. The result points into the input string.
static const char* extractFileName(const char* file) {
  const char* lastSlash = strrchr(file, '/');
  if (lastSlash == nullptr) {
    return file;
  }
  return lastSlash + 1;
}
// Logs a validation failure that was triggered by a caught exception `err`.
// The message records the catch site (base name of __FILE__, __LINE__,
// __FUNCTION__), the throw site (err.file(), err.line(), err.function()) and
// err.message(). Expands to a call to logValidateMsg, so it can only be used
// where that name is in scope.
#define LOG_VALIDATION_MSG_FROM_EXCEPTION(err) \
logValidateMsg(fmt::format( \
"Validation failed due to exception caught at file:{} line:{} function:{}, thrown from file:{} line:{} function:{}, reason:{}", \
extractFileName(__FILE__), \
__LINE__, \
__FUNCTION__, \
extractFileName(err.file()), \
err.line(), \
err.function(), \
err.message()))
// Logs a validation failure with a free-form `reason`, prefixed with the
// location of the macro expansion (base name of __FILE__, __LINE__,
// __FUNCTION__). Expands to a call to logValidateMsg, so it can only be used
// where that name is in scope.
#define LOG_VALIDATION_MSG(reason) \
logValidateMsg(fmt::format( \
"Validation failed at file:{}, line:{}, function:{}, reason:{}", \
extractFileName(__FILE__), \
__LINE__, \
__FUNCTION__, \
reason))
// Scalar function names that validateScalarFunction routes to
// validateRegexExpr so that their pattern argument gets checked.
static const std::unordered_set<std::string> kRegexFunctions = {
"regexp_extract",
"regexp_extract_all",
"regexp_replace",
"rlike"};
// Scalar function names that validateScalarFunction rejects unconditionally
// with a "Function is not supported" message.
static const std::unordered_set<std::string> kBlackList = {
"split_part",
"factorial",
"concat_ws",
"from_json",
"json_array_length",
"repeat",
"trunc",
"sequence",
"posexplode",
"arrays_overlap",
"approx_percentile",
"get_array_struct_fields"};
} // namespace
// Decodes the input column types carried by `extension` and appends them to
// `types`.
//
// The extension's enhancement must unpack into a ::substrait::Type holding a
// struct; every member of that struct is parsed with
// SubstraitParser::parseType. Returns true when all members were parsed.
// Returns false, after logging the reason, when the enhancement is missing,
// cannot be unpacked, has no struct, or a member type fails to parse. On a
// parse failure the members parsed so far remain in `types`.
bool SubstraitToVeloxPlanValidator::validateInputTypes(
    const ::substrait::extensions::AdvancedExtension& extension,
    std::vector<TypePtr>& types) {
  // The input type travels inside the enhancement field.
  if (!extension.has_enhancement()) {
    LOG_VALIDATION_MSG("Input type is not wrapped in enhancement.");
    return false;
  }

  ::substrait::Type wrappedType;
  if (!extension.enhancement().UnpackTo(&wrappedType)) {
    LOG_VALIDATION_MSG("Enhancement can't be unpacked to inputType.");
    return false;
  }

  if (!wrappedType.has_struct_()) {
    LOG_VALIDATION_MSG("Input type has no struct.");
    return false;
  }

  // Convert each struct member into its Velox type.
  try {
    for (const auto& memberType : wrappedType.struct_().types()) {
      types.emplace_back(SubstraitParser::parseType(memberType));
    }
  } catch (const VeloxException& err) {
    LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
    return false;
  }
  return true;
}
// Validates a `round` call.
//
// The call must have at least two arguments and the second one (the scale)
// must be an i32 or i64 literal that is not negative, because Velox and Spark
// disagree on negative scales. `inputType` is not consulted.
//
// Returns true when the scale is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validateRound(
    const ::substrait::Expression::ScalarFunction& scalarFunction,
    const RowTypePtr& inputType) {
  const auto& arguments = scalarFunction.arguments();
  if (arguments.size() < 2) {
    LOG_VALIDATION_MSG("Round is expected to have at least two arguments.");
    return false;
  }

  const auto& scaleArg = arguments[1].value();
  if (!scaleArg.has_literal()) {
    LOG_VALIDATION_MSG("Round scale is expected.");
    return false;
  }

  // Velox has different result with Spark on negative scale.
  const auto& scale = scaleArg.literal();
  const auto typeCase = scale.literal_type_case();
  bool nonNegative = false;
  switch (typeCase) {
    case ::substrait::Expression_Literal::LiteralTypeCase::kI32:
      nonNegative = scale.i32() >= 0;
      break;
    case ::substrait::Expression_Literal::LiteralTypeCase::kI64:
      nonNegative = scale.i64() >= 0;
      break;
    default:
      LOG_VALIDATION_MSG("Round scale validation is not supported for type case " + std::to_string(typeCase));
      return false;
  }
  if (!nonNegative) {
    LOG_VALIDATION_MSG("Round with negative scale is not supported.");
  }
  return nonNegative;
}
// Validates the converted parameters of an `extract` call.
//
// Exactly two parameters are required and the first one must be a constant
// expression holding a value; that value is read as a string naming the field
// to extract. Extracting "HOUR" is rejected because it produces incorrect
// results.
//
// Returns true when the call is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validateExtractExpr(const std::vector<core::TypedExprPtr>& params) {
  if (params.size() != 2) {
    LOG_VALIDATION_MSG("Two parameters are expected in ExtractExpr.");
    return false;
  }

  auto functionArg = std::dynamic_pointer_cast<const core::ConstantTypedExpr>(params[0]);
  if (!functionArg) {
    LOG_VALIDATION_MSG("Constant is expected to be the first parameter in extract.");
    return false;
  }

  // Get the function argument.
  const auto& variant = functionArg->value();
  if (!variant.hasValue()) {
    LOG_VALIDATION_MSG("Value expected in variant in ExtractExpr.");
    return false;
  }

  // The first parameter specifies extracting from which field.
  const auto& from = variant.value<std::string>();
  // Hour causes incorrect result.
  if (from == "HOUR") {
    LOG_VALIDATION_MSG("Extract from hour.");
    return false;
  }
  return true;
}
// Validates the pattern argument of the regex function called `name`.
//
// The call must have at least two arguments, and the second one must be a
// string literal accepted by validatePattern.
//
// Returns true when the pattern is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validateRegexExpr(
    const std::string& name,
    const ::substrait::Expression::ScalarFunction& scalarFunction) {
  if (scalarFunction.arguments().size() < 2) {
    LOG_VALIDATION_MSG("Wrong number of arguments for " + name);
    // Must bail out here: indexing arguments()[1] below would be out of range.
    return false;
  }

  const auto& patternArg = scalarFunction.arguments()[1].value();
  if (!patternArg.has_literal() || !patternArg.literal().has_string()) {
    LOG_VALIDATION_MSG("Pattern is not string literal for " + name);
    return false;
  }

  const auto& pattern = patternArg.literal().string();
  std::string error;
  if (!validatePattern(pattern, error)) {
    LOG_VALIDATION_MSG(name + " due to " + error);
    return false;
  }
  return true;
}
// Validates a scalar function call against `inputType`.
//
// Every argument that carries a value is validated recursively and converted
// to a Velox expression. The function name is then checked against the
// per-function rules below, the regex function set and the black list.
//
// Returns true when the call is accepted; otherwise returns false (the reason
// is logged by the check that rejected it).
bool SubstraitToVeloxPlanValidator::validateScalarFunction(
    const ::substrait::Expression::ScalarFunction& scalarFunction,
    const RowTypePtr& inputType) {
  const auto& arguments = scalarFunction.arguments();
  std::vector<core::TypedExprPtr> params;
  params.reserve(arguments.size());
  for (const auto& argument : arguments) {
    if (argument.has_value() && !validateExpression(argument.value(), inputType)) {
      return false;
    }
    params.emplace_back(exprConverter_->toVeloxExpr(argument.value(), inputType));
  }

  const auto& function =
      SubstraitParser::findFunctionSpec(planConverter_.getFunctionMap(), scalarFunction.function_reference());
  const auto& name = SubstraitParser::getNameBeforeDelimiter(function);
  std::vector<std::string> types = SubstraitParser::getSubFunctionTypes(function);

  // Returns the first argument type whose name mentions struct, map or list,
  // or nullptr when there is none.
  const auto firstNestedType = [&types]() -> const std::string* {
    for (const auto& type : types) {
      const bool nested = type.find("struct") != std::string::npos || type.find("map") != std::string::npos ||
          type.find("list") != std::string::npos;
      if (nested) {
        return &type;
      }
    }
    return nullptr;
  };

  // Functions with a dedicated validator.
  if (name == "round") {
    return validateRound(scalarFunction, inputType);
  }
  if (name == "extract") {
    return validateExtractExpr(params);
  }

  // Functions that are rejected outright.
  if (name == "map_from_arrays") {
    LOG_VALIDATION_MSG("map_from_arrays is not supported.");
    return false;
  }
  if (name == "get_array_item") {
    LOG_VALIDATION_MSG("get_array_item is not supported.");
    return false;
  }

  // Functions that are rejected for particular argument types.
  if (name == "char_length") {
    VELOX_CHECK(types.size() == 1);
    if (types[0] == "vbin") {
      LOG_VALIDATION_MSG("Binary type is not supported in " + name);
      return false;
    }
  } else if (name == "concat") {
    if (const std::string* nested = firstNestedType()) {
      LOG_VALIDATION_MSG(*nested + " is not supported in concat.");
      return false;
    }
  } else if (name == "murmur3hash") {
    if (const std::string* nested = firstNestedType()) {
      LOG_VALIDATION_MSG(*nested + " is not supported in murmur3hash.");
      return false;
    }
  }

  // Validate regex functions.
  if (kRegexFunctions.count(name) != 0) {
    return validateRegexExpr(name, scalarFunction);
  }

  if (kBlackList.count(name) != 0) {
    LOG_VALIDATION_MSG("Function is not supported: " + name);
    return false;
  }
  return true;
}
// Validates a literal, recursing into list elements and map keys/values.
//
// A list literal must contain at least one value and a map literal at least
// one key/value pair; every nested literal must itself be valid. Literals that
// are neither lists nor maps are accepted as-is.
//
// Returns true when the literal is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validateLiteral(
    const ::substrait::Expression_Literal& literal,
    const RowTypePtr& inputType) {
  if (literal.has_list()) {
    const auto& values = literal.list().values();
    if (values.empty()) {
      LOG_VALIDATION_MSG("Literal is a list but has no value.");
      return false;
    }
    // Iterate by reference: copying a protobuf message is a deep copy.
    for (const auto& child : values) {
      if (!validateLiteral(child, inputType)) {
        // The error message has already been logged by the nested call.
        return false;
      }
    }
  } else if (literal.has_map()) {
    const auto& keyValues = literal.map().key_values();
    if (keyValues.empty()) {
      LOG_VALIDATION_MSG("Literal is a map but has no value.");
      return false;
    }
    for (const auto& child : keyValues) {
      if (!validateLiteral(child.key(), inputType) || !validateLiteral(child.value(), inputType)) {
        // The error message has already been logged by the nested call.
        return false;
      }
    }
  }
  return true;
}
// Validates a cast expression against `inputType`.
//
// The cast input is validated first. The cast is then rejected when:
//  - the target type is TIMESTAMP;
//  - the input is a DATE and the target is anything other than VARCHAR;
//  - the input kind is ARRAY, MAP, ROW, VARBINARY or TIMESTAMP.
//
// Returns true when the cast is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validateCast(
    const ::substrait::Expression::Cast& castExpr,
    const RowTypePtr& inputType) {
  if (!validateExpression(castExpr.input(), inputType)) {
    return false;
  }

  const auto& toType = SubstraitParser::parseType(castExpr.type());
  if (toType->kind() == TypeKind::TIMESTAMP) {
    LOG_VALIDATION_MSG("Casting to TIMESTAMP is not supported.");
    return false;
  }

  core::TypedExprPtr input = exprConverter_->toVeloxExpr(castExpr.input(), inputType);

  // Casting from some types is not supported. See CastExpr::applyCast.
  // A TIMESTAMP target was already rejected above, so for DATE input only the
  // "not VARCHAR" case remains to be checked.
  if (input->type()->isDate() && toType->kind() != TypeKind::VARCHAR) {
    LOG_VALIDATION_MSG("Casting from DATE to " + toType->toString() + " is not supported.");
    return false;
  }

  switch (input->type()->kind()) {
    case TypeKind::ARRAY:
    case TypeKind::MAP:
    case TypeKind::ROW:
    case TypeKind::VARBINARY:
      LOG_VALIDATION_MSG("Invalid input type in casting: ARRAY/MAP/ROW/VARBINARY.");
      return false;
    case TypeKind::TIMESTAMP:
      LOG_VALIDATION_MSG("Casting from TIMESTAMP is not supported or has incorrect result.");
      return false;
    default:
      break;
  }
  return true;
}
// Validates an if/then expression against `inputType`.
//
// Every clause's condition and result expression is validated, and so is the
// else expression when one is present. Returns true only when all of them are
// valid; the failing sub-expression logs its own reason.
bool SubstraitToVeloxPlanValidator::validateIfThen(
    const ::substrait::Expression_IfThen& ifThen,
    const RowTypePtr& inputType) {
  for (const auto& subIfThen : ifThen.ifs()) {
    // Keep looping on success so that clauses after the first are checked too.
    if (!validateExpression(subIfThen.if_(), inputType) || !validateExpression(subIfThen.then(), inputType)) {
      return false;
    }
  }
  if (ifThen.has_else_() && !validateExpression(ifThen.else_(), inputType)) {
    return false;
  }
  return true;
}
// Validates a singular-or-list expression against `inputType`.
//
// Each option must be a valid literal; once all options pass, the value
// expression is validated. Returns false as soon as any check fails.
bool SubstraitToVeloxPlanValidator::validateSingularOrList(
    const ::substrait::Expression::SingularOrList& singularOrList,
    const RowTypePtr& inputType) {
  for (const auto& candidate : singularOrList.options()) {
    if (candidate.has_literal()) {
      if (validateLiteral(candidate.literal(), inputType)) {
        continue;
      }
      return false;
    }
    LOG_VALIDATION_MSG("Option is expected as Literal.");
    return false;
  }

  return validateExpression(singularOrList.value(), inputType);
}
// Dispatches validation of `expression` according to its kind.
//
// Scalar functions, literals, casts, if/then and singular-or-list expressions
// are handed to their dedicated validators; every other kind is accepted
// without further checks.
bool SubstraitToVeloxPlanValidator::validateExpression(
    const ::substrait::Expression& expression,
    const RowTypePtr& inputType) {
  using RexTypeCase = ::substrait::Expression::RexTypeCase;

  switch (expression.rex_type_case()) {
    case RexTypeCase::kScalarFunction:
      return validateScalarFunction(expression.scalar_function(), inputType);
    case RexTypeCase::kLiteral:
      return validateLiteral(expression.literal(), inputType);
    case RexTypeCase::kCast:
      return validateCast(expression.cast(), inputType);
    case RexTypeCase::kIfThen:
      return validateIfThen(expression.if_then(), inputType);
    case RexTypeCase::kSingularOrList:
      return validateSingularOrList(expression.singular_or_list(), inputType);
    default:
      break;
  }
  return true;
}
// Validates a WriteRel.
//
// The input relation (if any) must validate, the input types carried by the
// named table's advanced extension (if any) must parse, and every column
// flagged as a partition column by the table schema must have one of the
// accepted kinds: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR or
// VARBINARY.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::WriteRel& writeRel) {
  if (writeRel.has_input() && !validate(writeRel.input())) {
    LOG_VALIDATION_MSG("Validation failed for input type validation in WriteRel.");
    return false;
  }

  // Validate input data type.
  std::vector<TypePtr> types;
  if (writeRel.has_named_table() && !validateInputTypes(writeRel.named_table().advanced_extension(), types)) {
    LOG_VALIDATION_MSG("Validation failed for input type validation in WriteRel.");
    return false;
  }

  // Without a table schema there are no partition keys to check.
  if (!writeRel.has_table_schema()) {
    return true;
  }

  // Validate partition key type.
  auto isPartitionColumns = SubstraitParser::parsePartitionColumns(writeRel.table_schema());
  for (auto col = 0; col < types.size(); ++col) {
    if (!isPartitionColumns[col]) {
      continue;
    }
    const auto kind = types[col]->kind();
    const bool supported = kind == TypeKind::BOOLEAN || kind == TypeKind::TINYINT || kind == TypeKind::SMALLINT ||
        kind == TypeKind::INTEGER || kind == TypeKind::BIGINT || kind == TypeKind::VARCHAR ||
        kind == TypeKind::VARBINARY;
    if (!supported) {
      LOG_VALIDATION_MSG(
          "Validation failed for input type validation in WriteRel, not support partition column type: " +
          mapTypeKindToName(types[col]->kind()));
      return false;
    }
  }
  return true;
}
// Validates a FetchRel.
//
// When an advanced extension is present its input types must parse; they are
// used to build the row type handed to the sort-field processing below.
// Offset and count must be non-negative. When the input is a SortRel, its
// sorting keys must be unique by name.
//
// Returns true when the relation is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::FetchRel& fetchRel) {
  RowTypePtr rowType = nullptr;
  // Get and validate the input types from extension.
  if (fetchRel.has_advanced_extension()) {
    const auto& extension = fetchRel.advanced_extension();
    std::vector<TypePtr> types;
    if (!validateInputTypes(extension, types)) {
      LOG_VALIDATION_MSG("Unsupported input types in FetchRel.");
      return false;
    }

    int32_t inputPlanNodeId = 0;
    std::vector<std::string> names;
    names.reserve(types.size());
    for (uint32_t colIdx = 0; colIdx < types.size(); colIdx++) {
      names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
    }
    rowType = std::make_shared<RowType>(std::move(names), std::move(types));
  }

  if (fetchRel.offset() < 0 || fetchRel.count() < 0) {
    LOG_VALIDATION_MSG("Offset and count should be valid in FetchRel.");
    return false;
  }

  // Check the input of fetchRel, if it's sortRel, we need to check whether the sorting key is duplicated.
  if (fetchRel.has_input() && fetchRel.input().has_sort()) {
    // Bind by reference: copying the SortRel would deep-copy the message.
    const auto& sortRel = fetchRel.input().sort();
    auto [sortingKeys, sortingOrders] = planConverter_.processSortField(sortRel.sorts(), rowType);
    folly::F14FastSet<std::string> sortingKeyNames;
    for (const auto& sortingKey : sortingKeys) {
      auto result = sortingKeyNames.insert(sortingKey->name());
      if (!result.second) {
        LOG_VALIDATION_MSG(
            "If the input of fetchRel is a SortRel, we will convert it to a TopNNode. In Velox, it is important to ensure unique sorting keys. However, duplicate keys were found in this case.");
        return false;
      }
    }
  }
  return true;
}
// Validates a GenerateRel.
//
// The input relation (if any) must validate, the advanced extension must be
// present and carry parseable input types, and the generator expression (if
// any) must validate against a row type built from those input types.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::GenerateRel& generateRel) {
  if (generateRel.has_input() && !validate(generateRel.input())) {
    LOG_VALIDATION_MSG("Input validation fails in GenerateRel.");
    return false;
  }

  // The input types travel in the advanced extension.
  if (!generateRel.has_advanced_extension()) {
    LOG_VALIDATION_MSG("Input types are expected in GenerateRel.");
    return false;
  }

  std::vector<TypePtr> types;
  if (!validateInputTypes(generateRel.advanced_extension(), types)) {
    LOG_VALIDATION_MSG("Validation failed for input types in GenerateRel.");
    return false;
  }

  // Build placeholder column names for the row type.
  int32_t inputPlanNodeId = 0;
  std::vector<std::string> names;
  names.reserve(types.size());
  for (uint32_t colIdx = 0; colIdx < types.size(); ++colIdx) {
    names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
  }
  auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));

  if (!generateRel.has_generator()) {
    return true;
  }
  if (validateExpression(generateRel.generator(), rowType)) {
    return true;
  }
  LOG_VALIDATION_MSG("Input validation fails in GenerateRel.");
  return false;
}
// Validates an ExpandRel.
//
// The input relation (if any) must validate. When an advanced extension is
// present, its input types must parse and are used to build a row type. Every
// field must be a SwitchingField whose duplicates are only field selections or
// literals, and all SwitchingFields must have the same number of duplicates.
// When a row type is available, the duplicates are converted to Velox
// expressions and compiled to detect unregistered functions or type
// mismatches.
//
// Returns true when the relation is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ExpandRel& expandRel) {
  if (expandRel.has_input() && !validate(expandRel.input())) {
    LOG_VALIDATION_MSG("Input validation fails in ExpandRel.");
    return false;
  }

  RowTypePtr rowType = nullptr;
  // Get and validate the input types from extension.
  if (expandRel.has_advanced_extension()) {
    const auto& extension = expandRel.advanced_extension();
    std::vector<TypePtr> types;
    if (!validateInputTypes(extension, types)) {
      LOG_VALIDATION_MSG("Unsupported input types in ExpandRel.");
      return false;
    }

    int32_t inputPlanNodeId = 0;
    std::vector<std::string> names;
    names.reserve(types.size());
    for (uint32_t colIdx = 0; colIdx < types.size(); colIdx++) {
      names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
    }
    rowType = std::make_shared<RowType>(std::move(names), std::move(types));
  }

  int32_t projectSize = 0;
  // Validate fields.
  for (const auto& fields : expandRel.fields()) {
    if (!fields.has_switching_field()) {
      LOG_VALIDATION_MSG("Only SwitchingField is supported in ExpandRel.");
      return false;
    }

    // Bind by reference: copying the repeated field would deep-copy every
    // expression on each iteration.
    const auto& projectExprs = fields.switching_field().duplicates();
    std::vector<core::TypedExprPtr> expressions;
    expressions.reserve(projectExprs.size());

    // The first SwitchingField fixes the expected number of duplicates.
    if (projectSize == 0) {
      projectSize = projectExprs.size();
    } else if (projectSize != projectExprs.size()) {
      LOG_VALIDATION_MSG("SwitchingField expressions size should be constant in ExpandRel.");
      return false;
    }

    try {
      for (const auto& projectExpr : projectExprs) {
        const auto typeCase = projectExpr.rex_type_case();
        switch (typeCase) {
          case ::substrait::Expression::RexTypeCase::kSelection:
          case ::substrait::Expression::RexTypeCase::kLiteral:
            break;
          default:
            LOG_VALIDATION_MSG("Only field or literal is supported in project of ExpandRel.");
            return false;
        }
        if (rowType) {
          expressions.emplace_back(exprConverter_->toVeloxExpr(projectExpr, rowType));
        }
      }

      if (rowType) {
        // Try to compile the expressions. If there is any unregistered
        // function or mismatched type, exception will be thrown.
        exec::ExprSet exprSet(std::move(expressions), execCtx_);
      }
    } catch (const VeloxException& err) {
      LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
      return false;
    }
  }
  return true;
}
// Returns true when the window bound is one of the accepted kinds: unbounded
// following, unbounded preceding, current row, following or preceding. Any
// other kind yields false.
bool validateBoundType(::substrait::Expression_WindowFunction_Bound boundType) {
  using Bound = ::substrait::Expression_WindowFunction_Bound;

  switch (boundType.kind_case()) {
    case Bound::kUnboundedFollowing:
    case Bound::kUnboundedPreceding:
    case Bound::kCurrentRow:
    case Bound::kFollowing:
    case Bound::kPreceding:
      return true;
    default:
      return false;
  }
}
// Validates a WindowRel.
//
// Checks, in order: the input relation (if any), the input types carried by
// the required advanced extension, each window function (resolvable function
// spec, parseable output type, arguments that are field selections or
// literals, ROWS/RANGE window type, supported bound kinds), the set of
// unsupported functions, the partition expressions (must be plain field
// accesses and must compile) and the sort fields (supported direction, plain
// field access, must compile).
//
// Returns true when every check passes; otherwise logs the reason and returns
// false.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::WindowRel& windowRel) {
if (windowRel.has_input() && !validate(windowRel.input())) {
LOG_VALIDATION_MSG("WindowRel input fails to validate.");
return false;
}
// Get and validate the input types from extension.
if (!windowRel.has_advanced_extension()) {
LOG_VALIDATION_MSG("Input types are expected in WindowRel.");
return false;
}
const auto& extension = windowRel.advanced_extension();
std::vector<TypePtr> types;
if (!validateInputTypes(extension, types)) {
LOG_VALIDATION_MSG("Validation failed for input types in WindowRel.");
return false;
}
// Build placeholder column names so the input types can form a row type.
int32_t inputPlanNodeId = 0;
std::vector<std::string> names;
names.reserve(types.size());
for (auto colIdx = 0; colIdx < types.size(); colIdx++) {
names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
}
auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));
// Validate WindowFunction
std::vector<std::string> funcSpecs;
funcSpecs.reserve(windowRel.measures().size());
for (const auto& smea : windowRel.measures()) {
try {
const auto& windowFunction = smea.measure();
funcSpecs.emplace_back(planConverter_.findFuncSpec(windowFunction.function_reference()));
// The parsed type is discarded; the call is made only so that a parse
// failure surfaces as an exception handled below.
SubstraitParser::parseType(windowFunction.output_type());
// Arguments may only be field selections or literals.
for (const auto& arg : windowFunction.arguments()) {
auto typeCase = arg.value().rex_type_case();
switch (typeCase) {
case ::substrait::Expression::RexTypeCase::kSelection:
case ::substrait::Expression::RexTypeCase::kLiteral:
break;
default:
LOG_VALIDATION_MSG("Only field is supported in window functions.");
return false;
}
}
// Validate BoundType and Frame Type
switch (windowFunction.window_type()) {
case ::substrait::WindowType::ROWS:
case ::substrait::WindowType::RANGE:
break;
default:
LOG_VALIDATION_MSG(
"the window type only support ROWS and RANGE, and the input type is " +
std::to_string(windowFunction.window_type()));
return false;
}
bool boundTypeSupported =
validateBoundType(windowFunction.upper_bound()) && validateBoundType(windowFunction.lower_bound());
if (!boundTypeSupported) {
LOG_VALIDATION_MSG(
"Found unsupported Bound Type: upper " + std::to_string(windowFunction.upper_bound().kind_case()) +
", lower " + std::to_string(windowFunction.lower_bound().kind_case()));
return false;
}
} catch (const VeloxException& err) {
LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
return false;
}
}
// Validate supported aggregate functions.
static const std::unordered_set<std::string> unsupportedFuncs = {"collect_list", "collect_set"};
for (const auto& funcSpec : funcSpecs) {
auto funcName = SubstraitParser::getNameBeforeDelimiter(funcSpec);
if (unsupportedFuncs.find(funcName) != unsupportedFuncs.end()) {
LOG_VALIDATION_MSG(funcName + " was not supported in WindowRel.");
return false;
}
}
// Validate the partition (group-by) expressions.
const auto& groupByExprs = windowRel.partition_expressions();
std::vector<core::TypedExprPtr> expressions;
expressions.reserve(groupByExprs.size());
try {
for (const auto& expr : groupByExprs) {
auto expression = exprConverter_->toVeloxExpr(expr, rowType);
// Partition keys must be plain field accesses.
auto expr_field = dynamic_cast<const core::FieldAccessTypedExpr*>(expression.get());
if (expr_field == nullptr) {
LOG_VALIDATION_MSG("Only field is supported for partition key in Window Operator!");
return false;
} else {
expressions.emplace_back(expression);
}
}
// Try to compile the expressions. If there is any unregistered function or
// mismatched type, exception will be thrown.
exec::ExprSet exprSet(std::move(expressions), execCtx_);
} catch (const VeloxException& err) {
LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
return false;
}
// Validate Sort expression
const auto& sorts = windowRel.sorts();
for (const auto& sort : sorts) {
// Only the four ASC/DESC x NULLS FIRST/LAST directions are accepted.
switch (sort.direction()) {
case ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_FIRST:
case ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_LAST:
case ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_FIRST:
case ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_LAST:
break;
default:
LOG_VALIDATION_MSG("in windowRel, unsupported Sort direction " + std::to_string(sort.direction()));
return false;
}
if (sort.has_expr()) {
try {
auto expression = exprConverter_->toVeloxExpr(sort.expr(), rowType);
// Sorting keys must be plain field accesses.
auto expr_field = dynamic_cast<const core::FieldAccessTypedExpr*>(expression.get());
if (!expr_field) {
LOG_VALIDATION_MSG("in windowRel, the sorting key in Sort Operator only support field.");
return false;
}
// Compile the key; an exception signals an unsupported expression.
exec::ExprSet exprSet({std::move(expression)}, execCtx_);
} catch (const VeloxException& err) {
LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
return false;
}
}
}
return true;
}
// Validates a SortRel.
//
// The input relation (if any) must validate and the advanced extension must
// be present and carry parseable input types. Every sort field must use one of
// the four ASC/DESC x NULLS FIRST/LAST directions, and its expression (if any)
// must convert to a plain field access that compiles.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::SortRel& sortRel) {
  if (sortRel.has_input() && !validate(sortRel.input())) {
    return false;
  }

  // The input types travel in the advanced extension.
  if (!sortRel.has_advanced_extension()) {
    LOG_VALIDATION_MSG("Input types are expected in SortRel.");
    return false;
  }

  std::vector<TypePtr> types;
  if (!validateInputTypes(sortRel.advanced_extension(), types)) {
    LOG_VALIDATION_MSG("Validation failed for input types in SortRel.");
    return false;
  }

  // Build placeholder column names for the row type.
  int32_t inputPlanNodeId = 0;
  std::vector<std::string> names;
  names.reserve(types.size());
  for (auto colIdx = 0; colIdx < types.size(); ++colIdx) {
    names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
  }
  auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));

  for (const auto& sortField : sortRel.sorts()) {
    const auto direction = sortField.direction();
    const bool directionSupported =
        direction == ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_FIRST ||
        direction == ::substrait::SortField_SortDirection_SORT_DIRECTION_ASC_NULLS_LAST ||
        direction == ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_FIRST ||
        direction == ::substrait::SortField_SortDirection_SORT_DIRECTION_DESC_NULLS_LAST;
    if (!directionSupported) {
      LOG_VALIDATION_MSG("unsupported Sort direction " + std::to_string(sortField.direction()));
      return false;
    }

    if (!sortField.has_expr()) {
      continue;
    }

    try {
      auto expression = exprConverter_->toVeloxExpr(sortField.expr(), rowType);
      // Only plain field accesses are accepted as sorting keys.
      if (dynamic_cast<const core::FieldAccessTypedExpr*>(expression.get()) == nullptr) {
        LOG_VALIDATION_MSG("in SortRel, the sorting key in Sort Operator only support field.");
        return false;
      }
      // Compile the key; an exception signals an unsupported expression.
      exec::ExprSet exprSet({std::move(expression)}, execCtx_);
    } catch (const VeloxException& err) {
      LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
      return false;
    }
  }
  return true;
}
// Validates a ProjectRel.
//
// The input relation (if any) must validate and the advanced extension must
// be present and carry parseable input types. Every project expression must
// pass validateExpression, convert to a Velox expression and compile together
// with the others.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ProjectRel& projectRel) {
  if (projectRel.has_input() && !validate(projectRel.input())) {
    LOG_VALIDATION_MSG("ProjectRel input");
    return false;
  }

  // The input types travel in the advanced extension.
  if (!projectRel.has_advanced_extension()) {
    LOG_VALIDATION_MSG("Input types are expected in ProjectRel.");
    return false;
  }

  std::vector<TypePtr> types;
  if (!validateInputTypes(projectRel.advanced_extension(), types)) {
    LOG_VALIDATION_MSG("Validation failed for input types in ProjectRel.");
    return false;
  }

  // Build placeholder column names for the row type.
  int32_t inputPlanNodeId = 0;
  std::vector<std::string> names;
  names.reserve(types.size());
  for (uint32_t colIdx = 0; colIdx < types.size(); ++colIdx) {
    names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
  }
  auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));

  // Validate and convert the project expressions.
  const auto& projectExprs = projectRel.expressions();
  std::vector<core::TypedExprPtr> converted;
  converted.reserve(projectExprs.size());
  try {
    for (const auto& projectExpr : projectExprs) {
      if (!validateExpression(projectExpr, rowType)) {
        return false;
      }
      converted.emplace_back(exprConverter_->toVeloxExpr(projectExpr, rowType));
    }
    // Compiling throws when a function is unregistered or a type mismatches.
    exec::ExprSet exprSet(std::move(converted), execCtx_);
  } catch (const VeloxException& err) {
    LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
    return false;
  }
  return true;
}
// Validates a FilterRel.
//
// The input relation (if any) must validate and the advanced extension must
// be present and carry parseable input types. The filter condition must pass
// validateExpression, convert to a Velox expression and compile.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::FilterRel& filterRel) {
  if (filterRel.has_input() && !validate(filterRel.input())) {
    LOG_VALIDATION_MSG("input of FilterRel validation fails");
    return false;
  }

  // The input types travel in the advanced extension.
  if (!filterRel.has_advanced_extension()) {
    LOG_VALIDATION_MSG("Input types are expected in FilterRel.");
    return false;
  }

  std::vector<TypePtr> types;
  if (!validateInputTypes(filterRel.advanced_extension(), types)) {
    LOG_VALIDATION_MSG("Validation failed for input types in FilterRel.");
    return false;
  }

  // Build placeholder column names for the row type.
  int32_t inputPlanNodeId = 0;
  std::vector<std::string> names;
  names.reserve(types.size());
  for (uint32_t colIdx = 0; colIdx < types.size(); ++colIdx) {
    names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
  }
  auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));

  try {
    const auto& condition = filterRel.condition();
    if (!validateExpression(condition, rowType)) {
      return false;
    }
    std::vector<core::TypedExprPtr> converted;
    converted.emplace_back(exprConverter_->toVeloxExpr(condition, rowType));
    // Compiling throws when a function is unregistered or a type mismatches.
    exec::ExprSet exprSet(std::move(converted), execCtx_);
  } catch (const VeloxException& err) {
    LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
    return false;
  }
  return true;
}
// Validates a JoinRel.
//
// Both inputs (when present) must validate. When the advanced extension sets
// "isSMJ=" in its optimization, only inner and left joins are accepted. In all
// cases the join type must be one of inner, outer, left, right, left semi,
// right semi or anti. The advanced extension must carry parseable input types;
// join keys must be extractable from the join expression (if any) and the
// post-join filter (if any) must convert and compile.
//
// Returns true when the relation is acceptable; otherwise logs the reason and
// returns false.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::JoinRel& joinRel) {
  if (joinRel.has_left() && !validate(joinRel.left())) {
    LOG_VALIDATION_MSG("Validation fails for join left input.");
    return false;
  }

  if (joinRel.has_right() && !validate(joinRel.right())) {
    LOG_VALIDATION_MSG("Validation fails for join right input.");
    return false;
  }

  if (joinRel.has_advanced_extension() &&
      SubstraitParser::configSetInOptimization(joinRel.advanced_extension(), "isSMJ=")) {
    switch (joinRel.type()) {
      case ::substrait::JoinRel_JoinType_JOIN_TYPE_INNER:
      case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT:
        break;
      default:
        LOG_VALIDATION_MSG("Sort merge join only support inner and left join.");
        return false;
    }
  }

  switch (joinRel.type()) {
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_INNER:
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_OUTER:
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT:
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_RIGHT:
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_LEFT_SEMI:
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_RIGHT_SEMI:
    case ::substrait::JoinRel_JoinType_JOIN_TYPE_ANTI:
      break;
    default:
      // This check applies to every join, not only sort merge joins.
      LOG_VALIDATION_MSG("Join type is not supported: " + std::to_string(joinRel.type()));
      return false;
  }

  // Validate input types.
  if (!joinRel.has_advanced_extension()) {
    LOG_VALIDATION_MSG("Input types are expected in JoinRel.");
    return false;
  }

  const auto& extension = joinRel.advanced_extension();
  std::vector<TypePtr> types;
  if (!validateInputTypes(extension, types)) {
    LOG_VALIDATION_MSG("Validation failed for input types in JoinRel.");
    return false;
  }

  // Build placeholder column names for the row type.
  int32_t inputPlanNodeId = 0;
  std::vector<std::string> names;
  names.reserve(types.size());
  for (uint32_t colIdx = 0; colIdx < types.size(); colIdx++) {
    names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
  }
  auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));

  if (joinRel.has_expression()) {
    // The extracted keys are discarded; only the ability to extract them
    // without an exception matters here.
    std::vector<const ::substrait::Expression::FieldReference*> leftExprs, rightExprs;
    try {
      planConverter_.extractJoinKeys(joinRel.expression(), leftExprs, rightExprs);
    } catch (const VeloxException& err) {
      LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
      return false;
    }
  }

  if (joinRel.has_post_join_filter()) {
    try {
      auto expression = exprConverter_->toVeloxExpr(joinRel.post_join_filter(), rowType);
      // Compiling throws when a function is unregistered or a type mismatches.
      exec::ExprSet exprSet({std::move(expression)}, execCtx_);
    } catch (const VeloxException& err) {
      LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
      return false;
    }
  }
  return true;
}
// Validates a CrossRel.
//
// Both inputs (when present) must validate and the advanced extension must be
// present and carry parseable input types. The join expression (if any) must
// convert to a Velox expression and compile.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::CrossRel& crossRel) {
  const bool leftOk = !crossRel.has_left() || validate(crossRel.left());
  if (!leftOk) {
    logValidateMsg("native validation failed due to: validation fails for cross join left input. ");
    return false;
  }

  const bool rightOk = !crossRel.has_right() || validate(crossRel.right());
  if (!rightOk) {
    logValidateMsg("native validation failed due to: validation fails for cross join right input. ");
    return false;
  }

  // The input types travel in the advanced extension.
  if (!crossRel.has_advanced_extension()) {
    logValidateMsg("native validation failed due to: Input types are expected in CrossRel.");
    return false;
  }

  std::vector<TypePtr> types;
  if (!validateInputTypes(crossRel.advanced_extension(), types)) {
    logValidateMsg("native validation failed due to: Validation failed for input types in CrossRel");
    return false;
  }

  // Build placeholder column names for the row type.
  int32_t inputPlanNodeId = 0;
  std::vector<std::string> names;
  names.reserve(types.size());
  for (auto colIdx = 0; colIdx < types.size(); ++colIdx) {
    names.emplace_back(SubstraitParser::makeNodeName(inputPlanNodeId, colIdx));
  }
  auto rowType = std::make_shared<RowType>(std::move(names), std::move(types));

  if (!crossRel.has_expression()) {
    return true;
  }

  try {
    auto expression = exprConverter_->toVeloxExpr(crossRel.expression(), rowType);
    // Compiling throws when a function is unregistered or a type mismatches.
    exec::ExprSet exprSet({std::move(expression)}, execCtx_);
  } catch (const VeloxException& err) {
    logValidateMsg("native validation failed due to: crossRel expression validation fails, " + err.message());
    return false;
  }
  return true;
}
// Checks that every aggregate measure in `aggRel` can be bound to a
// registered aggregate function signature.
//
// For each measure the argument types are derived from the function spec, the
// Velox function name is built from the spec name and the aggregation step,
// and the registered signatures are searched for one that binds to the
// argument types. For the first signature that binds, its intermediate type
// (partial-output steps) or return type (other steps) must also resolve.
//
// Returns true when all measures resolve (trivially true with no measures);
// otherwise logs the reason and returns false.
bool SubstraitToVeloxPlanValidator::validateAggRelFunctionType(const ::substrait::AggregateRel& aggRel) {
  for (const auto& smea : aggRel.measures()) {
    const auto& aggFunction = smea.measure();
    const auto& funcStep = planConverter_.toAggregationFunctionStep(aggFunction);
    auto funcSpec = planConverter_.findFuncSpec(aggFunction.function_reference());

    std::vector<TypePtr> types;
    bool isDecimal = false;
    try {
      types = SubstraitParser::sigToTypes(funcSpec);
      // One decimal argument is enough to select the decimal variant.
      for (const auto& type : types) {
        if (type->isDecimal()) {
          isDecimal = true;
          break;
        }
      }
    } catch (const VeloxException& err) {
      LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
      return false;
    }

    auto baseFuncName =
        SubstraitParser::mapToVeloxFunction(SubstraitParser::getNameBeforeDelimiter(funcSpec), isDecimal);
    auto funcName = planConverter_.toAggregationFunctionName(baseFuncName, funcStep);
    auto signaturesOpt = exec::getAggregateFunctionSignatures(funcName);
    if (!signaturesOpt) {
      LOG_VALIDATION_MSG("can not find function signature for " + funcName + " in AggregateRel.");
      return false;
    }

    bool resolved = false;
    for (const auto& signature : signaturesOpt.value()) {
      exec::SignatureBinder binder(*signature, types);
      if (!binder.tryBind()) {
        continue;
      }
      auto resolveType = binder.tryResolveType(
          exec::isPartialOutput(funcStep) ? signature->intermediateType() : signature->returnType());
      if (resolveType == nullptr) {
        LOG_VALIDATION_MSG("Validation failed for function " + funcName + " resolve type in AggregateRel.");
        return false;
      }
      resolved = true;
      break;
    }

    if (!resolved) {
      LOG_VALIDATION_MSG("Validation failed for function " + funcName + " bind signatures in AggregateRel.");
      return false;
    }
  }
  return true;
}
// Validates an AggregateRel.
//
// Checks, in order (each failure logs a reason and returns false):
//   1. The input relation, when set, validates recursively.
//   2. When the advanced extension carries an enhancement, its input types
//      validate.
//   3. Every grouping expression is a field selection.
//   4. For every measure: a non-empty filter is a field selection, the output
//      type can be parsed, "count" has at most one argument, and every
//      argument is a field selection or a literal.
//   5. Every function name is in the allow-list below.
//   6. Every function binds to a registered signature
//      (validateAggRelFunctionType).
//   7. The aggregation has at least one grouping expression or one measure.
// Returns true when all checks pass.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::AggregateRel& aggRel) {
  if (aggRel.has_input() && !validate(aggRel.input())) {
    LOG_VALIDATION_MSG("Input validation fails in AggregateRel.");
    return false;
  }

  // Validate input types.
  if (aggRel.has_advanced_extension()) {
    std::vector<TypePtr> types;
    const auto& extension = aggRel.advanced_extension();
    // Aggregate always has advanced extension for streaming aggregate optimization,
    // but only some of them have enhancement for validation.
    if (extension.has_enhancement() && !validateInputTypes(extension, types)) {
      LOG_VALIDATION_MSG("Validation failed for input types in AggregateRel.");
      return false;
    }
  }

  // Validate groupings.
  for (const auto& grouping : aggRel.groupings()) {
    for (const auto& groupingExpr : grouping.grouping_expressions()) {
      const auto& typeCase = groupingExpr.rex_type_case();
      switch (typeCase) {
        case ::substrait::Expression::RexTypeCase::kSelection:
          break;
        default:
          LOG_VALIDATION_MSG("Only field is supported in groupings.");
          return false;
      }
    }
  }

  // Validate aggregate functions.
  std::vector<std::string> funcSpecs;
  funcSpecs.reserve(aggRel.measures().size());
  for (const auto& smea : aggRel.measures()) {
    try {
      // Validate the filter expression.
      if (smea.has_filter()) {
        // Bind by reference: the filter is only inspected, so copying the
        // whole protobuf message is unnecessary.
        const auto& aggRelMask = smea.filter();
        if (aggRelMask.ByteSizeLong() > 0) {
          auto typeCase = aggRelMask.rex_type_case();
          switch (typeCase) {
            case ::substrait::Expression::RexTypeCase::kSelection:
              break;
            default:
              LOG_VALIDATION_MSG("Only field is supported in aggregate filter expression.");
              return false;
          }
        }
      }

      const auto& aggFunction = smea.measure();
      const auto& functionSpec = planConverter_.findFuncSpec(aggFunction.function_reference());
      funcSpecs.emplace_back(functionSpec);

      // The parsed type is discarded; presumably the call is made so that an
      // unsupported output type raises a VeloxException, caught below.
      SubstraitParser::parseType(aggFunction.output_type());

      // Validate the size of arguments. Count accepts only one argument.
      if (SubstraitParser::getNameBeforeDelimiter(functionSpec) == "count" && aggFunction.arguments().size() > 1) {
        LOG_VALIDATION_MSG("Count should have only one argument.");
        return false;
      }

      // Arguments must be field selections or literals.
      for (const auto& arg : aggFunction.arguments()) {
        auto typeCase = arg.value().rex_type_case();
        switch (typeCase) {
          case ::substrait::Expression::RexTypeCase::kSelection:
          case ::substrait::Expression::RexTypeCase::kLiteral:
            break;
          default:
            LOG_VALIDATION_MSG("Only field or literal is supported in aggregate functions.");
            return false;
        }
      }
    } catch (const VeloxException& err) {
      LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
      return false;
    }
  }

  // The supported aggregation functions. TODO: Remove this set when Presto aggregate functions in Velox are not
  // needed to be registered.
  static const std::unordered_set<std::string> supportedAggFuncs = {
      "sum",
      "collect_set",
      "count",
      "avg",
      "min",
      "max",
      "min_by",
      "max_by",
      "stddev_samp",
      "stddev_pop",
      "bloom_filter_agg",
      "var_samp",
      "var_pop",
      "bit_and",
      "bit_or",
      "bit_xor",
      "first",
      "first_ignore_null",
      "last",
      "last_ignore_null",
      "corr",
      "covar_pop",
      "covar_samp",
      "approx_distinct"};

  for (const auto& funcSpec : funcSpecs) {
    auto funcName = SubstraitParser::getNameBeforeDelimiter(funcSpec);
    if (supportedAggFuncs.find(funcName) == supportedAggFuncs.end()) {
      LOG_VALIDATION_MSG(funcName + " was not supported in AggregateRel.");
      return false;
    }
  }

  if (!validateAggRelFunctionType(aggRel)) {
    return false;
  }

  // Validate both groupby and aggregates input are empty, which is corner case.
  if (aggRel.measures_size() == 0) {
    bool hasExpr = false;
    for (const auto& grouping : aggRel.groupings()) {
      if (grouping.grouping_expressions().size() > 0) {
        hasExpr = true;
        break;
      }
    }
    if (!hasExpr) {
      LOG_VALIDATION_MSG("Aggregation must specify either grouping keys or aggregates.");
      return false;
    }
  }
  return true;
}
// Validates a ReadRel.
//
// The relation is first converted with the plan converter; a VeloxException
// from that conversion fails the validation. If the relation carries a filter,
// the filter is then checked with validateExpression, converted against a row
// type built from the base schema (empty when no base schema is set), and used
// to construct an ExprSet. Returns true when nothing fails.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ReadRel& readRel) {
  try {
    planConverter_.toVeloxPlan(readRel);
  } catch (const VeloxException& err) {
    LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
    return false;
  }

  // Without a filter there is nothing else to validate.
  if (!readRel.has_filter()) {
    return true;
  }

  // NOTE(review): parseNamedStruct runs outside any try block here; confirm it
  // cannot throw once toVeloxPlan above has succeeded.
  std::vector<TypePtr> columnTypes;
  if (readRel.has_base_schema()) {
    columnTypes = SubstraitParser::parseNamedStruct(readRel.base_schema());
  }

  // Name every schema column through SubstraitParser::makeNodeName, using plan
  // node id 0 and the column position.
  int32_t planNodeId = 0;
  std::vector<std::string> columnNames;
  columnNames.reserve(columnTypes.size());
  for (auto columnIdx = 0; columnIdx < columnTypes.size(); ++columnIdx) {
    columnNames.emplace_back(SubstraitParser::makeNodeName(planNodeId, columnIdx));
  }
  auto schemaRowType = std::make_shared<RowType>(std::move(columnNames), std::move(columnTypes));

  try {
    if (!validateExpression(readRel.filter(), schemaRowType)) {
      return false;
    }
    std::vector<core::TypedExprPtr> filterExprs;
    filterExprs.emplace_back(exprConverter_->toVeloxExpr(readRel.filter(), schemaRowType));
    // Try to compile the expressions. If there is any unregistered function
    // or mismatched type, exception will be thrown.
    exec::ExprSet compiledExprs(std::move(filterExprs), execCtx_);
  } catch (const VeloxException& err) {
    LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
    return false;
  }
  return true;
}
// Dispatches validation of a generic Rel to the validator of the concrete
// relation it holds. The relation kinds are probed in a fixed order and the
// first one that is set decides the result.
//
// Returns the result of the matching validator, or false (with a logged
// reason) when the Rel holds none of the relation kinds handled here.
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::Rel& rel) {
  if (rel.has_aggregate()) {
    return validate(rel.aggregate());
  }
  if (rel.has_project()) {
    return validate(rel.project());
  }
  if (rel.has_filter()) {
    return validate(rel.filter());
  }
  if (rel.has_join()) {
    return validate(rel.join());
  }
  if (rel.has_cross()) {
    return validate(rel.cross());
  }
  if (rel.has_read()) {
    return validate(rel.read());
  }
  if (rel.has_sort()) {
    return validate(rel.sort());
  }
  if (rel.has_expand()) {
    return validate(rel.expand());
  }
  if (rel.has_generate()) {
    return validate(rel.generate());
  }
  if (rel.has_fetch()) {
    return validate(rel.fetch());
  }
  if (rel.has_window()) {
    return validate(rel.window());
  }
  if (rel.has_write()) {
    return validate(rel.write());
  }

  // Log a reason so that this rejection is not silent, unlike the previous
  // bare `return false`.
  LOG_VALIDATION_MSG("Unsupported relation type in Rel.");
  return false;
}
// Validates a RelRoot by validating its input relation. A root without an
// input is rejected (returns false without logging).
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::RelRoot& relRoot) {
  if (!relRoot.has_input()) {
    return false;
  }
  return validate(relRoot.input());
}
// Entry point: validates a whole Substrait plan.
//
// The plan converter's function map is built from the plan and the expression
// converter is taken from the plan converter before any relation is checked.
// Only the first relation that holds either a root or a rel is validated and
// its result is returned; later relations are not inspected.
//
// Returns false when the plan has no such relation, when the validated
// relation is rejected, or when a VeloxException is raised while preparing the
// converters or validating (the exception is logged rather than propagated, so
// that a failure surfaces as a validation result).
bool SubstraitToVeloxPlanValidator::validate(const ::substrait::Plan& plan) {
  try {
    // Create plan converter and expression converter to help the validation.
    planConverter_.constructFunctionMap(plan);
    exprConverter_ = planConverter_.getExprConverter();

    for (const auto& rel : plan.relations()) {
      if (rel.has_root()) {
        return validate(rel.root());
      }
      if (rel.has_rel()) {
        return validate(rel.rel());
      }
    }
  } catch (const VeloxException& err) {
    // Converter helpers reached from here may throw outside the narrower try
    // blocks of the per-relation validators.
    LOG_VALIDATION_MSG_FROM_EXCEPTION(err);
    return false;
  }
  return false;
}
} // namespace gluten