backend/schema/updater/sql_expression_validators.cc (367 lines of code) (raw):

// // Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include <memory> #include <string> #include <vector> #include "zetasql/public/analyzer.h" #include "zetasql/public/analyzer_options.h" #include "zetasql/public/catalog.h" #include "zetasql/public/function_signature.h" #include "zetasql/public/language_options.h" #include "zetasql/public/simple_catalog.h" #include "zetasql/public/types/type.h" #include "zetasql/public/types/type_factory.h" #include "zetasql/resolved_ast/resolved_ast.h" #include "zetasql/resolved_ast/resolved_node.h" #include "zetasql/resolved_ast/resolved_node_kind.pb.h" #include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/status/status.h" #include "absl/strings/string_view.h" #include "absl/strings/substitute.h" #include "backend/query/analyzer_options.h" #include "backend/query/catalog.h" #include "backend/query/function_catalog.h" #include "backend/query/query_context.h" #include "backend/query/query_validator.h" #include "backend/query/queryable_column.h" #include "backend/query/queryable_table.h" #include "backend/query/queryable_view.h" #include "backend/schema/catalog/schema.h" #include "backend/schema/catalog/table.h" #include "backend/schema/catalog/udf.h" #include "backend/schema/catalog/view.h" #include "backend/schema/graph/schema_node.h" #include "common/errors.h" #include "common/limits.h" #include "zetasql/base/ret_check.h" #include "zetasql/base/status_macros.h" 
namespace google {
namespace spanner {
namespace emulator {
namespace backend {

namespace {

// Translates the volatility reported by the analyzer for a function into the
// determinism level recorded for UDFs. IMMUTABLE, STABLE and VOLATILE map to
// DETERMINISTIC, NOT_DETERMINISTIC_STABLE and NOT_DETERMINISTIC_VOLATILE
// respectively; every other value yields DETERMINISM_UNSPECIFIED.
Udf::Determinism AnalyzedVolatilityToDeterminism(
    zetasql::FunctionEnums::Volatility volatility) {
  if (volatility == zetasql::FunctionEnums::IMMUTABLE) {
    return Udf::Determinism::DETERMINISTIC;
  }
  if (volatility == zetasql::FunctionEnums::STABLE) {
    return Udf::Determinism::NOT_DETERMINISTIC_STABLE;
  }
  if (volatility == zetasql::FunctionEnums::VOLATILE) {
    return Udf::Determinism::NOT_DETERMINISTIC_VOLATILE;
  }
  return Udf::Determinism::DETERMINISM_UNSPECIFIED;
}

// Folds `determinism_2` into `determinism_1` and returns whichever of the two
// is the less deterministic:
//   - NOT_DETERMINISTIC_VOLATILE in `determinism_2` always wins.
//   - NOT_DETERMINISTIC_STABLE in `determinism_2` wins unless `determinism_1`
//     is something other than STABLE, DETERMINISTIC or UNSPECIFIED.
//   - DETERMINISTIC in `determinism_2` wins only over DETERMINISTIC or
//     UNSPECIFIED.
//   - Any other `determinism_2` leaves `determinism_1` unchanged.
Udf::Determinism ReduceToLeastDeterministic(Udf::Determinism determinism_1,
                                            Udf::Determinism determinism_2) {
  using Determinism = Udf::Determinism;

  const bool first_is_deterministic_or_unspecified =
      determinism_1 == Determinism::DETERMINISTIC ||
      determinism_1 == Determinism::DETERMINISM_UNSPECIFIED;

  switch (determinism_2) {
    case Determinism::NOT_DETERMINISTIC_VOLATILE:
      return determinism_2;
    case Determinism::NOT_DETERMINISTIC_STABLE:
      if (determinism_1 == Determinism::NOT_DETERMINISTIC_STABLE ||
          first_is_deterministic_or_unspecified) {
        return determinism_2;
      }
      return determinism_1;
    case Determinism::DETERMINISTIC:
      if (first_is_deterministic_or_unspecified) {
        return determinism_2;
      }
      return determinism_1;
    default:
      return determinism_1;
  }
}

}  // namespace

// A validator that checks column expressions for valid SQL.
class ColumnExpressionValidator : public QueryValidator { public: ColumnExpressionValidator( const Schema* schema, const zetasql::Table* table, absl::string_view expression_use, absl::flat_hash_set<std::string>* dependent_column_names, bool allow_volatile_expression, absl::flat_hash_set<const SchemaNode*>* udf_dependencies) : QueryValidator(QueryContext{.schema = schema, .allow_read_write_only_functions = true}, /*options=*/nullptr), table_(table), expression_use_(expression_use), dependent_column_names_(dependent_column_names), allow_volatile_expression_(allow_volatile_expression), udf_dependencies_(udf_dependencies) {} absl::Status DefaultVisit(const zetasql::ResolvedNode* node) override { if (node->IsScan() || node->node_kind() == zetasql::RESOLVED_SUBQUERY_EXPR) { return error::NonScalarExpressionInColumnExpression(expression_use_); } if (node->node_kind() == zetasql::RESOLVED_EXPRESSION_COLUMN) { std::string column_name = node->GetAs<zetasql::ResolvedExpressionColumn>()->name(); const zetasql::Column* column = table_->FindColumnByName(column_name); ZETASQL_RET_CHECK_NE(column, nullptr); dependent_column_names_->insert(column->Name()); } return QueryValidator::DefaultVisit(node); } protected: absl::Status VisitResolvedFunctionCall( const zetasql::ResolvedFunctionCall* node) override { // The validation order matters here. // Need to invoke the parent visitor first since some higher level // validation should precede the deterministic function check. For example, // using pending_commit_timestamp() in generated column at CREATE TABLE // should return error due to that function only being allowed in INSERT or // UPDATE. ZETASQL_RETURN_IF_ERROR(QueryValidator::VisitResolvedFunctionCall(node)); const Udf* udf = schema()->FindUdf(node->function()->FullName(false)); if (udf != nullptr) { // The schema object UDF is transitive across its own dependencies. 
if (udf->determinism_level() != Udf::Determinism::DETERMINISTIC && !allow_volatile_expression_) { return error::NonDeterministicFunctionInColumnExpression( udf->Name(), expression_use_); } udf_dependencies_->insert(udf); } else { if (node->function()->function_options().volatility != zetasql::FunctionEnums::IMMUTABLE && !allow_volatile_expression_) { return error::NonDeterministicFunctionInColumnExpression( node->function()->SQLName(), expression_use_); } } return absl::OkStatus(); } private: const zetasql::Table* table_; absl::string_view expression_use_; absl::flat_hash_set<std::string>* dependent_column_names_; bool allow_volatile_expression_; absl::flat_hash_set<const SchemaNode*>* udf_dependencies_; }; // A validator that checks view definitions for valid SQL. class ViewDefinitionValidator : public QueryValidator { public: // The dependencies returned in `dependencies` are not transitive. i.e. they // are only the direct dependencies of the view definition being validated. ViewDefinitionValidator(const Schema* schema, const zetasql::LanguageOptions& language_options, absl::flat_hash_set<const SchemaNode*>* dependencies) : QueryValidator({.schema = schema}, /*extracted_options=*/nullptr, /*language_options=*/language_options), dependencies_(dependencies) {} private: absl::Status VisitResolvedWithScan( const zetasql::ResolvedWithScan* node) override { return error::WithViewsAreNotSupported(); } absl::Status VisitResolvedTableScan( const zetasql::ResolvedTableScan* scan) override { // Visit the entire tree for the scan first, validating it and collecting // any references to indexes. Collect the references after the view query // has been determined to be valid. ZETASQL_RETURN_IF_ERROR(QueryValidator::VisitResolvedTableScan(scan)); // The 'catalog table' referenced in the resolved AST could be a table or a // view. 
auto catalog_table = scan->table(); if (catalog_table->Is<backend::QueryableTable>()) { dependencies_->insert( catalog_table->GetAs<backend::QueryableTable>()->wrapped_table()); } else if (catalog_table->Is<backend::QueryableView>()) { dependencies_->insert( catalog_table->GetAs<backend::QueryableView>()->wrapped_view()); } else { // This should not happen. A view referencing a non-existent dependency // should fail analaysis. ZETASQL_RET_CHECK_FAIL() << "Dependency not found: " << catalog_table->Name(); } // Add the column dependencies for the view. // We analyze the view with prune_unused_columns=true. This should result // in the resolved scan containing only the columns that are referenced in // the view. const auto& used_columns = scan->column_index_list(); for (auto column_index : used_columns) { auto catalog_column = catalog_table->GetColumn(column_index); ZETASQL_RET_CHECK_NE(catalog_column, nullptr) << "Referenced column " << scan->column_list()[column_index].DebugString() << " not found in " << catalog_table->Name(); if (catalog_column->Is<backend::QueryableColumn>()) { dependencies_->insert(catalog_column->GetAs<backend::QueryableColumn>() ->wrapped_column()); } } // Also add any indexes used as dependencies for (const auto* index : indexes_used()) { ZETASQL_RET_CHECK_NE(index, nullptr); dependencies_->insert(index); } return absl::OkStatus(); } absl::Status VisitResolvedFunctionCall( const zetasql::ResolvedFunctionCall* node) override { ZETASQL_RETURN_IF_ERROR(QueryValidator::VisitResolvedFunctionCall(node)); const Udf* udf = schema()->FindUdf(node->function()->FullName(/*include_group=*/false)); if (udf != nullptr) { dependencies_->insert(udf); } return absl::OkStatus(); } private: absl::flat_hash_set<const SchemaNode*>* dependencies_; }; // A validator that checks udf definitions for valid SQL. class UdfDefinitionValidator : public QueryValidator { public: // The dependencies returned in `dependencies` are not transitive. i.e. 
they // are only the direct dependencies of the view definition being validated. UdfDefinitionValidator(const Schema* schema, const zetasql::LanguageOptions& language_options, absl::flat_hash_set<const SchemaNode*>* dependencies, Udf::Determinism* determinism_level) : QueryValidator({.schema = schema}, /*extracted_options=*/nullptr, /*language_options=*/language_options), dependencies_(dependencies), determinism_level_(determinism_level) {} private: absl::Status VisitResolvedWithScan( const zetasql::ResolvedWithScan* node) override { return error::WithViewsAreNotSupported(); } absl::Status VisitResolvedTableScan( const zetasql::ResolvedTableScan* scan) override { // Visit the entire tree for the scan first, validating it and collecting // any references to indexes. Collect the references after the udf query // has been determined to be valid. ZETASQL_RETURN_IF_ERROR(QueryValidator::VisitResolvedTableScan(scan)); // The 'catalog table' referenced in the resolved AST could be a table or a // view. auto catalog_table = scan->table(); if (catalog_table->Is<backend::QueryableTable>()) { dependencies_->insert( catalog_table->GetAs<backend::QueryableTable>()->wrapped_table()); } else if (catalog_table->Is<backend::QueryableView>()) { dependencies_->insert( catalog_table->GetAs<backend::QueryableView>()->wrapped_view()); } else { // This should not happen. A udf referencing a non-existent dependency // should fail analaysis. ZETASQL_RET_CHECK_FAIL() << "Dependency not found: " << catalog_table->Name(); } // Add the column dependencies for the udf. // We analyze the udf with prune_unused_columns=true. This should result // in the resolved scan containing only the columns that are referenced in // the udf. 
const auto& used_columns = scan->column_index_list(); for (auto column_index : used_columns) { auto catalog_column = catalog_table->GetColumn(column_index); ZETASQL_RET_CHECK_NE(catalog_column, nullptr) << "Referenced column " << scan->column_list()[column_index].DebugString() << " not found in " << catalog_table->Name(); if (catalog_column->Is<backend::QueryableColumn>()) { dependencies_->insert(catalog_column->GetAs<backend::QueryableColumn>() ->wrapped_column()); } } // Also add any indexes used as dependencies for (const auto* index : indexes_used()) { ZETASQL_RET_CHECK_NE(index, nullptr); dependencies_->insert(index); } return absl::OkStatus(); } protected: absl::Status VisitResolvedFunctionCall( const zetasql::ResolvedFunctionCall* node) override { ZETASQL_RETURN_IF_ERROR(QueryValidator::VisitResolvedFunctionCall(node)); // ZETASQL_VLOG IF THIS UDF IS ALWAYS THE SAME AS THE NODE ONE const Udf* udf = schema()->FindUdf(node->function()->FullName(false)); if (udf != nullptr) { *determinism_level_ = ReduceToLeastDeterministic( *determinism_level_, udf->determinism_level()); dependencies_->insert(udf); } else { *determinism_level_ = ReduceToLeastDeterministic( *determinism_level_, AnalyzedVolatilityToDeterminism( node->function()->function_options().volatility)); } return absl::OkStatus(); } private: absl::flat_hash_set<const SchemaNode*>* dependencies_; Udf::Determinism* determinism_level_; }; absl::Status AnalyzeColumnExpression( absl::string_view expression, const zetasql::Type* target_type, const Table* table, const Schema* schema, zetasql::TypeFactory* type_factory, const std::vector<zetasql::SimpleTable::NameAndType>& name_and_types, absl::string_view expression_use, absl::flat_hash_set<std::string>* dependent_column_names, absl::flat_hash_set<const SchemaNode*>* dependent_sequences, bool allow_volatile_expression, absl::flat_hash_set<const SchemaNode*>* udf_dependencies) { zetasql::SimpleTable simple_table(table->Name(), name_and_types); 
zetasql::AnalyzerOptions options = MakeGoogleSqlAnalyzerOptions(schema->default_time_zone()); // ZetaSQL rewriting could rewrite scalar expressions into subquery. // Disable all default enabled rewriting to check the original shape of // user provided expression and ensure forward compatibility. auto enabled_rewrites = options.enabled_rewrites(); for (auto enabled_rewrite : enabled_rewrites) { options.enable_rewrite(enabled_rewrite, false); } for (const auto& name_and_type : name_and_types) { ZETASQL_RETURN_IF_ERROR( options.AddExpressionColumn(name_and_type.first, name_and_type.second)); } std::unique_ptr<const zetasql::AnalyzerOutput> output; FunctionCatalog function_catalog(type_factory); Catalog catalog(schema, &function_catalog, type_factory, MakeGoogleSqlAnalyzerOptions(schema->default_time_zone())); ZETASQL_RETURN_IF_ERROR(zetasql::AnalyzeExpressionForAssignmentToType( expression, options, &catalog, type_factory, target_type, &output)); ColumnExpressionValidator validator( schema, &simple_table, expression_use, dependent_column_names, allow_volatile_expression, udf_dependencies); ZETASQL_RETURN_IF_ERROR(output->resolved_expr()->Accept(&validator)); if (output->resolved_expr()->GetTreeDepth() > limits::kColumnExpressionMaxDepth) { return error::ColumnExpressionMaxDepthExceeded( output->resolved_expr()->GetTreeDepth(), limits::kColumnExpressionMaxDepth); } if (dependent_sequences != nullptr && !validator.dependent_sequences().empty()) { *dependent_sequences = validator.dependent_sequences(); } return absl::OkStatus(); } absl::Status AnalyzeViewDefinition( absl::string_view view_name, absl::string_view view_definition, const Schema* schema, zetasql::TypeFactory* type_factory, std::vector<View::Column>* output_columns, absl::flat_hash_set<const SchemaNode*>* dependencies) { auto body = absl::Substitute("CREATE VIEW `$0` SQL SECURITY INVOKER AS $1", view_name, view_definition); // Analyze the view definition. 
auto analyzer_options = MakeGoogleSqlAnalyzerOptionsForViewsAndFunctions( schema->default_time_zone(), schema->dialect()); analyzer_options.set_prune_unused_columns(true); FunctionCatalog function_catalog( type_factory, kCloudSpannerEmulatorFunctionCatalogName, schema); Catalog catalog(schema, &function_catalog, type_factory, analyzer_options); std::unique_ptr<const zetasql::AnalyzerOutput> analyzer_output; ZETASQL_RETURN_IF_ERROR(zetasql::AnalyzeStatement(body, analyzer_options, &catalog, type_factory, &analyzer_output)); // Check the view definition for only allowed elements. const zetasql::ResolvedCreateViewStmt* create_view_stmt = analyzer_output->resolved_statement() ->GetAs<zetasql::ResolvedCreateViewStmt>(); ViewDefinitionValidator validator(schema, analyzer_options.language(), dependencies); ZETASQL_RETURN_IF_ERROR(create_view_stmt->query()->Accept(&validator)); for (const auto& c : create_view_stmt->output_column_list()) { output_columns->emplace_back(View::Column{c->name(), c->column().type()}); } for (const SchemaNode* sequence : validator.dependent_sequences()) { dependencies->insert(sequence); } return absl::OkStatus(); } absl::Status AnalyzeUdfDefinition( absl::string_view udf_name, absl::string_view param_list, absl::string_view udf_definition, const Schema* schema, zetasql::TypeFactory* type_factory, absl::flat_hash_set<const SchemaNode*>* dependencies, std::unique_ptr<zetasql::FunctionSignature>* function_signature, Udf::Determinism* determinism_level) { auto body = absl::Substitute("CREATE FUNCTION `$0`($1) SQL SECURITY INVOKER AS ($2)", udf_name, param_list, udf_definition); // Analyze the udf definition. 
auto analyzer_options = MakeGoogleSqlAnalyzerOptionsForViewsAndFunctions( schema->default_time_zone(), schema->dialect()); analyzer_options.set_prune_unused_columns(true); FunctionCatalog function_catalog( type_factory, kCloudSpannerEmulatorFunctionCatalogName, schema); Catalog catalog(schema, &function_catalog, type_factory, analyzer_options); std::unique_ptr<const zetasql::AnalyzerOutput> analyzer_output; ZETASQL_RETURN_IF_ERROR(zetasql::AnalyzeStatement(body, analyzer_options, &catalog, type_factory, &analyzer_output)); // Check the udf definition for only allowed elements. const zetasql::ResolvedCreateFunctionStmt* create_function_stmt = analyzer_output->resolved_statement() ->GetAs<zetasql::ResolvedCreateFunctionStmt>(); UdfDefinitionValidator validator(schema, analyzer_options.language(), dependencies, determinism_level); ZETASQL_RETURN_IF_ERROR( create_function_stmt->function_expression()->Accept(&validator)); for (const SchemaNode* sequence : validator.dependent_sequences()) { dependencies->insert(sequence); } *function_signature = absl::make_unique<zetasql::FunctionSignature>( create_function_stmt->signature()); return absl::OkStatus(); } } // namespace backend } // namespace emulator } // namespace spanner } // namespace google