sql/sql_union.cc

/* Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Process query expressions that are composed of 1. UNION of query blocks, and/or 2. have ORDER BY / LIMIT clauses in more than one level. An example of 2) is: (SELECT * FROM t1 ORDER BY a LIMIT 10) ORDER BY b LIMIT 5 UNION's were introduced by Monty and Sinisa <sinisa@mysql.com> */ #include "sql_union.h" #include "my_config.h" #include <string.h> #include <sys/types.h> #include "auth_acls.h" #include "current_thd.h" #include "debug_sync.h" // DEBUG_SYNC #include "error_handler.h" // Strict_error_handler #include "field.h" #include "filesort.h" // filesort_free_buffers #include "handler.h" #include "item.h" #include "item_subselect.h" #include "my_base.h" #include "my_dbug.h" #include "my_macros.h" #include "my_sys.h" #include "mysql_com.h" #include "mysqld_error.h" #include "opt_explain.h" // explain_no_table #include "opt_explain_format.h" #include "parse_tree_node_base.h" #include "query_options.h" #include "sql_base.h" // fill_record #include "sql_class.h" #include "sql_executor.h" #include "sql_lex.h" #include "window.h" // Window #include "sql_list.h" #include "sql_optimizer.h" // JOIN #include "sql_select.h" #include "sql_tmp_table.h" // tmp tables bool Query_result_union::prepare(List<Item>&, SELECT_LEX_UNIT *u) { unit= u; return false; } bool Query_result_union::send_data(List<Item> &values) { // Skip "offset" number of rows before producing rows if (unit->offset_limit_cnt > 0) { unit->offset_limit_cnt--; return false; } if (fill_record(thd, table, table->visible_field_ptr(), values, NULL, NULL)) return true; /* purecov: inspected */ if (!check_unique_constraint(table)) return false; const int error= table->file->ha_write_row(table->record[0]); if (!error) { m_rows_in_table++; return false; } // create_ondisk_from_heap will generate error if needed if (!table->file->is_ignorable_error(error)) { bool is_duplicate; if (create_ondisk_from_heap(thd, table, tmp_table_param.start_recinfo, &tmp_table_param.recinfo, error, true, &is_duplicate)) return true; /* purecov: inspected */ // Table's engine changed, index is not initialized anymore if (table->hash_field) table->file->ha_index_init(0, false); if (!is_duplicate) m_rows_in_table++; } return false; } bool Query_result_union::send_eof() { return false; } bool Query_result_union::flush() { const int error= table->file->extra(HA_EXTRA_NO_CACHE); if (error) { table->file->print_error(error, MYF(0)); /* purecov: inspected */ return true; /* purecov: inspected */ } return false; } /** Create a temporary table to store the result of Query_result_union. @param thd_arg thread handle @param column_types a list of items used to define columns of the temporary table @param is_union_distinct if set, the temporary table will eliminate duplicates on insert @param options create options @param table_alias name of the temporary table @param bit_fields_as_long convert bit fields to ulonglong @param create_table If false, a table handler will not be created when creating the result table. @details Create a temporary table that is used to store the result of a UNION, derived table, or a materialized cursor. @returns false if table created, true if error */ bool Query_result_union::create_result_table(THD *thd_arg, List<Item> *column_types, bool is_union_distinct, ulonglong options, const char *table_alias, bool bit_fields_as_long, bool create_table) { DBUG_ASSERT(table == NULL); tmp_table_param= Temp_table_param(); count_field_types(thd_arg->lex->current_select(), &tmp_table_param, *column_types, false, true); tmp_table_param.skip_create_table= !create_table; tmp_table_param.bit_fields_as_long= bit_fields_as_long; if (unit != nullptr) { if (unit->is_recursive()) { /* If the UNIQUE key specified for UNION DISTINCT were a primary key in InnoDB, rows would be returned by the scan in an order depending on their columns' values, not in insertion order. */ tmp_table_param.can_use_pk_for_unique= false; tmp_table_param.allow_scan_from_position= true; } if (unit->mixed_union_operators()) { /* Generally, UNIQUE key can be promoted to PK, saving the space consumption of a hidden PK. However, if the query mixes UNION ALL and UNION DISTINCT, the PK will be disabled at some point in execution, which InnoDB doesn't support as it uses a clustered PK. Then, no PK: */ tmp_table_param.can_use_pk_for_unique= false; } } if (! (table= create_tmp_table(thd_arg, &tmp_table_param, *column_types, NULL, is_union_distinct, true, options, HA_POS_ERROR, (char*) table_alias, TMP_WIN_UNCONDITIONAL))) return true; if (create_table) { table->file->extra(HA_EXTRA_WRITE_CACHE); table->file->extra(HA_EXTRA_IGNORE_DUP_KEY); if (table->hash_field) table->file->ha_index_init(0, 0); } return false; } /** Reset and empty the temporary table that stores the materialized query result. @note The cleanup performed here is exactly the same as for the two temp tables of JOIN - exec_tmp_table_[1 | 2]. */ void Query_result_union::cleanup() { if (table == NULL) return; table->file->extra(HA_EXTRA_RESET_STATE); if (table->hash_field) table->file->ha_index_or_rnd_end(); table->file->ha_delete_all_rows(); free_io_cache(table); filesort_free_buffers(table,0); } /** UNION result that is passed directly to the receiving Query_result without filling a temporary table. Function calls are forwarded to the wrapped Query_result, but some functions are expected to be called only once for each query, so they are only executed for the first query block in the union (except for send_eof(), which is executed only for the last query block). This Query_result is used when a UNION is not DISTINCT and doesn't have a global ORDER BY clause. @see SELECT_LEX_UNIT::prepare(). */ class Query_result_union_direct final : public Query_result_union { private: /// Result object that receives all rows Query_result *result; /// The last query block of the union SELECT_LEX *last_select_lex; /// Wrapped result is optimized bool optimized; /// Wrapped result has sent metadata bool result_set_metadata_sent; /// Wrapped result has started execution bool execution_started; /// Accumulated current_found_rows ulonglong current_found_rows; /// Number of rows offset ha_rows offset; /// Number of rows limit + offset, @see Query_result_union_direct::send_data() ha_rows limit; public: Query_result_union_direct(THD *thd, Query_result *result, SELECT_LEX *last_select_lex) :Query_result_union(thd), result(result), last_select_lex(last_select_lex), optimized(false), result_set_metadata_sent(false), execution_started(false), current_found_rows(0) {} bool change_query_result(Query_result *new_result) override; uint field_count(List<Item> &) const override { // Only called for top-level Query_results, usually Query_result_send DBUG_ASSERT(false); /* purecov: inspected */ return 0; /* purecov: inspected */ } bool postponed_prepare(List<Item> &types) override; bool send_result_set_metadata(List<Item> &list, uint flags) override; bool send_data(List<Item> &items) override; bool optimize() override { if (optimized) return false; optimized= true; return result->optimize(); } bool start_execution() override { if (execution_started) return false; execution_started= true; return result->start_execution(); } void send_error(uint errcode, const char *err) override { result->send_error(errcode, err); /* purecov: inspected */ } bool send_eof() override; bool flush() override { return false; } bool check_simple_select() const override { // Only called for top-level Query_results, usually Query_result_send DBUG_ASSERT(false); /* purecov: inspected */ return false; /* purecov: inspected */ } void abort_result_set() override { result->abort_result_set(); /* purecov: inspected */ } void cleanup() override {} void set_thd(THD*) { /* Only called for top-level Query_results, usually Query_result_send, and for the results of subquery engines (select_<something>_subselect). */ DBUG_ASSERT(false); /* purecov: inspected */ } }; /** Replace the current query result with new_result and prepare it. @param new_result New query result @returns false if success, true if error */ bool Query_result_union_direct::change_query_result(Query_result *new_result) { result= new_result; return result->prepare(unit->types, unit); } bool Query_result_union_direct::postponed_prepare(List<Item> &types) { if (result == NULL) return false; return result->prepare(types, unit); } bool Query_result_union_direct::send_result_set_metadata(List<Item>&, uint flags) { if (result_set_metadata_sent) return false; result_set_metadata_sent= true; /* Set global offset and limit to be used in send_data(). These can be variables in prepared statements or stored programs, so they must be reevaluated for each execution. */ offset= unit->global_parameters()->get_offset(); limit= unit->global_parameters()->get_limit(); if (limit + offset >= limit) limit+= offset; else limit= HA_POS_ERROR; /* purecov: inspected */ return result->send_result_set_metadata(unit->types, flags); } bool Query_result_union_direct::send_data(List<Item> &items) { if (limit == 0) return false; limit--; if (offset) { offset--; return false; } if (fill_record(thd, table, table->field, items, NULL, NULL)) return true; /* purecov: inspected */ return result->send_data(unit->item_list); } bool Query_result_union_direct::send_eof() { /* Accumulate the found_rows count for the current query block into the UNION. Number of rows returned from a query block is always non-negative. */ ulonglong offset= thd->lex->current_select()->get_offset(); current_found_rows+= thd->current_found_rows > offset ? thd->current_found_rows - offset : 0; if (unit->thd->lex->current_select() == last_select_lex) { /* If SQL_CALC_FOUND_ROWS is not enabled, adjust the current_found_rows according to the global limit and offset defined. */ if (!(unit->first_select()->active_options() & OPTION_FOUND_ROWS)) { ha_rows global_limit= unit->global_parameters()->get_limit(); ha_rows global_offset= unit->global_parameters()->get_offset(); if (global_limit != HA_POS_ERROR) { if (global_offset != HA_POS_ERROR) global_limit+= global_offset; if (current_found_rows > global_limit) current_found_rows= global_limit; } } thd->current_found_rows= current_found_rows; // Reset and make ready for re-execution // @todo: Dangerous if we have an error midway? result_set_metadata_sent= false; optimized= false; execution_started= false; return result->send_eof(); } else return false; } /// RAII class to automate saving/restoring of current_select() class Change_current_select { public: Change_current_select(THD *thd_arg) : thd(thd_arg), saved_select(thd->lex->current_select()) {} void restore() { thd->lex->set_current_select(saved_select); } ~Change_current_select() { restore(); } private: THD *thd; SELECT_LEX *saved_select; }; /** Prepare the fake_select_lex query block @param thd_arg Thread handler @returns false if success, true if error */ bool SELECT_LEX_UNIT::prepare_fake_select_lex(THD *thd_arg) { DBUG_ENTER("SELECT_LEX_UNIT::prepare_fake_select_lex"); DBUG_ASSERT(thd_arg->lex->current_select() == fake_select_lex); // The UNION result table is input table for this query block fake_select_lex->table_list.link_in_list(&result_table_list, &result_table_list.next_local); result_table_list.select_lex= fake_select_lex; // Set up the result table for name resolution fake_select_lex->context.table_list= fake_select_lex->context.first_name_resolution_table= fake_select_lex->get_table_list(); if (!fake_select_lex->first_execution) { for (ORDER *order= fake_select_lex->order_list.first; order; order= order->next) order->item= &order->item_ptr; } for (ORDER *order= fake_select_lex->order_list.first; order; order=order->next) { (*order->item)->walk(&Item::change_context_processor, Item::WALK_POSTFIX, (uchar*) &fake_select_lex->context); } fake_select_lex->set_query_result(query_result()); fake_select_lex->fields_list= item_list; /* We need to add up n_sum_items in order to make the correct allocation in setup_ref_array(). Don't add more sum_items if we have already done SELECT_LEX::prepare for this (with a different join object) */ if (fake_select_lex->base_ref_items.is_null()) fake_select_lex->n_child_sum_items+= fake_select_lex->n_sum_items; DBUG_ASSERT(fake_select_lex->with_wild == 0 && fake_select_lex->master_unit() == this && !fake_select_lex->group_list.elements && fake_select_lex->where_cond() == NULL && fake_select_lex->having_cond() == NULL); if (is_recursive()) { /* The fake_select_lex's JOIN is going to read result_table_list repeatedly, so this table has all the attributes of a recursive reference: */ result_table_list.set_recursive_reference(); } if (fake_select_lex->prepare(thd_arg)) DBUG_RETURN(true); DBUG_RETURN(false); } /** Prepares all query blocks of a query expression, including fake_select_lex. If a recursive query expression, this also creates the materialized temporary table. @param thd_arg Thread handler @param sel_result Result object where the unit's output should go. @param added_options These options will be added to the query blocks. @param removed_options Options that cannot be used for this query @returns false if success, true if error */ bool SELECT_LEX_UNIT::prepare(THD *thd_arg, Query_result *sel_result, ulonglong added_options, ulonglong removed_options) { DBUG_ENTER("SELECT_LEX_UNIT::prepare"); DBUG_ASSERT(!is_prepared()); Change_current_select save_select(thd); Query_result *tmp_result; bool instantiate_tmp_table= false; SELECT_LEX *last_select= first_select(); while (last_select->next_select()) last_select= last_select->next_select(); set_query_result(sel_result); thd_arg->lex->set_current_select(first_select()); // Save fake_select_lex in case we don't need it for anything but // global parameters. if (saved_fake_select_lex == NULL && // Don't overwrite on PS second prepare fake_select_lex != NULL) { thd->lock_query_plan(); saved_fake_select_lex= fake_select_lex; thd->unlock_query_plan(); } const bool simple_query_expression= is_simple(); // Create query result object for use by underlying query blocks if (!simple_query_expression) { if (is_union() && !union_needs_tmp_table()) { if (!(tmp_result= union_result= new (*THR_MALLOC) Query_result_union_direct(thd, sel_result, last_select))) goto err; /* purecov: inspected */ if (fake_select_lex != NULL) { thd->lock_query_plan(); fake_select_lex= NULL; thd->unlock_query_plan(); } instantiate_tmp_table= false; } else { if (!(tmp_result= union_result= new (*THR_MALLOC) Query_result_union(thd))) goto err; /* purecov: inspected */ instantiate_tmp_table= true; } } else { // Only one query block, and no "fake" object: No extra result needed: tmp_result= sel_result; } if (fake_select_lex != NULL) { /* There exists a query block that consolidates the UNION result. Prepare the active options for this query block. If these options contain OPTION_BUFFER_RESULT, the query block will perform a buffering operation, which means that an underlying query block does not need to buffer its result, and the buffer option for the underlying query blocks can be cleared. For subqueries in form "a IN (SELECT .. UNION SELECT ..): when optimizing the fake_select_lex that reads the results of the union from a temporary table, do not mark the temp. table as constant because the contents in it may vary from one subquery execution to another, by adding OPTION_NO_CONST_TABLES. */ fake_select_lex->make_active_options( (added_options & (OPTION_FOUND_ROWS | OPTION_BUFFER_RESULT)) | OPTION_NO_CONST_TABLES | SELECT_NO_UNLOCK, 0); added_options&= ~OPTION_BUFFER_RESULT; } first_select()->context.resolve_in_select_list= true; for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select()) { // All query blocks get their options in this phase sl->set_query_result(tmp_result); sl->make_active_options(added_options | SELECT_NO_UNLOCK, removed_options); sl->fields_list= sl->item_list; /* setup_tables_done_option should be set only for very first SELECT, because it protect from second setup_tables call for select-like non select commands (DELETE/INSERT/...) and they use only very first SELECT (for union it can be only INSERT ... SELECT). */ added_options&= ~OPTION_SETUP_TABLES_DONE; thd_arg->lex->set_current_select(sl); if (sl == first_recursive) { // create_result_table() depends on current_select() save_select.restore(); /* All next query blocks will read the temporary table, which we must thus create now: */ if (derived_table->setup_materialized_derived_tmp_table(thd_arg)) goto err; /* purecov: inspected */ thd_arg->lex->set_current_select(sl); } if (sl->recursive_reference) // Make tmp table known to query block: derived_table->common_table_expr()-> substitute_recursive_reference(thd_arg, sl); if (sl->prepare(thd_arg)) goto err; /* Use items list of underlaid select for derived tables to preserve information about fields lengths and exact types */ if (!is_union()) types= first_select()->item_list; else if (sl == first_select()) { types.empty(); List_iterator_fast<Item> it(sl->item_list); Item *item_tmp; while ((item_tmp= it++)) { /* If the outer query has a GROUP BY clause, an outer reference to this query block may have been wrapped in a Item_outer_ref, which has not been fixed yet. An Item_type_holder must be created based on a fixed Item, so use the inner Item instead. */ DBUG_ASSERT(item_tmp->fixed || (item_tmp->type() == Item::REF_ITEM && down_cast<Item_ref *>(item_tmp)->ref_type() == Item_ref::OUTER_REF)); if (!item_tmp->fixed) item_tmp= item_tmp->real_item(); auto holder= new Item_type_holder(thd_arg, item_tmp); if (!holder) goto err; /* purecov: inspected */ if (is_recursive()) { holder->maybe_null= true; // Always nullable, per SQL standard. /* The UNION code relies on join_types() to change some transitional types like MYSQL_TYPE_DATETIME2 into other types; in case this is the only nonrecursive query block join_types() won't be called so we need an explicit call: */ holder->join_types(thd_arg, item_tmp); } types.push_back(holder); } } else { if (types.elements != sl->item_list.elements) { my_error(ER_WRONG_NUMBER_OF_COLUMNS_IN_SELECT, MYF(0)); goto err; } if (sl->recursive_reference) { /* Recursive query blocks don't determine output types of the result. The only thing to check could be if the recursive query block has a type which can't be cast to the output type of the result. But in MySQL, all types can be cast to each other (at least during resolution; an error may reported when trying to actually insert, for example an INT into a POINT). So no further compatibility check is needed here. */ } else { List_iterator_fast<Item> it(sl->item_list); List_iterator_fast<Item> tp(types); Item *type, *item_tmp; while ((type= tp++, item_tmp= it++)) { if (((Item_type_holder*)type)->join_types(thd_arg, item_tmp)) goto err; } } } if (sl->recursive_reference && (sl->is_grouped() || sl->m_windows.elements > 0)) { // Per SQL2011. my_error(ER_CTE_RECURSIVE_FORBIDS_AGGREGATION, MYF(0), derived_table->alias); goto err; } } if (is_recursive()) { // This had to wait until all query blocks are prepared: if (check_materialized_derived_query_blocks(thd_arg)) goto err; /* purecov: inspected */ } /* If the query is using Query_result_union_direct, we have postponed preparation of the underlying Query_result until column types are known. */ if (union_result != NULL && union_result->postponed_prepare(types)) goto err; if (!simple_query_expression) { /* Check that it was possible to aggregate all collations together for UNION. We need this in case of UNION DISTINCT, to filter out duplicates using the proper collation. TODO: consider removing this test in case of UNION ALL. */ List_iterator_fast<Item> tp(types); Item *type; while ((type= tp++)) { if (type->result_type() == STRING_RESULT && type->collation.derivation == DERIVATION_NONE) { my_error(ER_CANT_AGGREGATE_NCOLLATIONS, MYF(0), "UNION"); goto err; } } ulonglong create_options= first_select()->active_options() | TMP_TABLE_ALL_COLUMNS; /* Force the temporary table to be a MyISAM table if we're going to use fulltext functions (MATCH ... AGAINST .. IN BOOLEAN MODE) when reading from it (this should be removed when fulltext search is moved out of MyISAM). */ if (fake_select_lex && fake_select_lex->ftfunc_list->elements) create_options|= TMP_TABLE_FORCE_MYISAM; if (union_result->create_result_table(thd, &types, union_distinct != nullptr, create_options, "", false, instantiate_tmp_table)) goto err; memset(&result_table_list, 0, sizeof(result_table_list)); result_table_list.db= (char*) ""; result_table_list.table_name= result_table_list.alias= (char*) "union"; result_table_list.table= table= union_result->table; table->pos_in_table_list= &result_table_list; result_table_list.select_lex= fake_select_lex ? fake_select_lex : saved_fake_select_lex; result_table_list.set_tableno(0); result_table_list.set_privileges(SELECT_ACL); if (!item_list.elements) { Prepared_stmt_arena_holder ps_arena_holder(thd); if (table->fill_item_list(&item_list)) goto err; /* purecov: inspected */ } else { /* We're in execution of a prepared statement or stored procedure: reset field items to point at fields from the created temporary table. */ table->reset_item_list(&item_list); } if (fake_select_lex != NULL) { thd_arg->lex->set_current_select(fake_select_lex); if (prepare_fake_select_lex(thd_arg)) goto err; } } // Query blocks are prepared, update the state set_prepared(); DBUG_RETURN(false); err: (void) cleanup(false); DBUG_RETURN(true); } /** Optimize all query blocks of a query expression, including fake_select_lex @param thd thread handler @returns false if optimization successful, true if error */ bool SELECT_LEX_UNIT::optimize(THD *thd) { DBUG_ENTER("SELECT_LEX_UNIT::optimize"); DBUG_ASSERT(is_prepared() && !is_optimized()); Change_current_select save_select(thd); for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select()) { thd->lex->set_current_select(sl); // LIMIT is required for optimization if (set_limit(thd, sl)) DBUG_RETURN(true); /* purecov: inspected */ if (sl->optimize(thd)) DBUG_RETURN(true); /* Accumulate estimated number of rows. 1. Implicitly grouped query has one row (with HAVING it has zero or one rows). 2. If GROUP BY clause is optimized away because it was a constant then query produces at most one row. */ if (query_result()) query_result()->estimated_rowcount+= sl->is_implicitly_grouped() || sl->join->group_optimized_away ? 1 : sl->join->best_rowcount; } if (fake_select_lex) { thd->lex->set_current_select(fake_select_lex); if (set_limit(thd, fake_select_lex)) DBUG_RETURN(true); /* purecov: inspected */ /* In EXPLAIN command, constant subqueries that do not use any tables are executed two times: - 1st time is a real evaluation to get the subquery value - 2nd time is to produce EXPLAIN output rows. 1st execution sets certain members (e.g. Query_result) to perform subquery execution rather than EXPLAIN line production. In order to reset them back, we re-do all of the actions (yes it is ugly). */ DBUG_ASSERT(fake_select_lex->with_wild == 0 && fake_select_lex->master_unit() == this && !fake_select_lex->group_list.elements && fake_select_lex->get_table_list() == &result_table_list && fake_select_lex->where_cond() == NULL && fake_select_lex->having_cond() == NULL); if (fake_select_lex->optimize(thd)) DBUG_RETURN(true); } set_optimized(); // All query blocks optimized, update the state DBUG_RETURN(false); } /** Explain query starting from this unit. @param ethd THD of explaining thread @return false if success, true if error */ bool SELECT_LEX_UNIT::explain(THD *ethd) { DBUG_ENTER("SELECT_LEX_UNIT::explain"); #ifndef DBUG_OFF SELECT_LEX *lex_select_save= thd->lex->current_select(); #endif Explain_format *fmt= ethd->lex->explain_format; const bool other= (thd != ethd); bool ret= false; if (!other) { DBUG_ASSERT(!is_simple() && is_optimized()); set_executed(); } if (fmt->begin_context(CTX_UNION)) DBUG_RETURN(true); for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select()) { if (fmt->begin_context(CTX_QUERY_SPEC)) DBUG_RETURN(true); if (explain_query_specification(ethd, sl, CTX_JOIN) || fmt->end_context(CTX_QUERY_SPEC)) DBUG_RETURN(true); } if (fake_select_lex != NULL) { // Don't save result as it's needed only for consequent exec. ret= explain_query_specification(ethd, fake_select_lex, CTX_UNION_RESULT); } if (!other) DBUG_ASSERT(thd->lex->current_select() == lex_select_save); if (ret) DBUG_RETURN(true); fmt->end_context(CTX_UNION); DBUG_RETURN(false); } /** Helper class for SELECT_LEX_UNIT::execute(). Manages executions of non-recursive and recursive query blocks (if any). There are two possible flows of data rows for recursive CTEs: 1) Assuming QB1 UNION ALL QBR2 UNION ALL QBR3, where QBRi are recursive, we have a single tmp table (the derived table's result): QB1 appends rows to tmp table. Label eval_recursive_members: QBR2 reads tmp table and appends new rows to it; it also reads its own new rows, etc (loop back) until no more rows. QBR3 same. If rows have been inserted since we passed the label, go to the label. 2) Assuming QB1 UNION DISTINCT QBR2 UNION DISTINCT QBR3, where QBRi are recursive, we have two tmp tables (the union-distinct's result UR and, at the external layer, the derived table's result DR; UR has a unique index); FAKE is the fake_select_lex: QB1 appends rows to UR. FAKE reads from UR and appends to DR. Label eval_recursive_members: QBR2 reads DR and appends new rows to UR; thus it does not read its own new rows as they are not in DR yet. QBR3 same. FAKE reads from UR and appends to DR. If rows have been inserted into DR since we passed the label, go to the label. In both flows, sub_select() is used to read the recursive reference with a table scan. It reads until there are no more rows, which could be simply implemented by reading until the storage engine reports EOF, but is not. The reason is that storage engines (MEMORY, InnoDB) have behaviour at EOF which isn't compatible with the requirement to catch up with new rows: 1) In both engines, when they report EOF, the scan stays blocked at EOF forever even if rows are later inserted. In detail, this happens because heap_scan() unconditionally increments info->current_record, and because InnoDB has a supremum record. 2) Specifically for the MEMORY engine: the UNION DISTINCT table of a recursive CTE receives interlaced writes (which can hit a duplicate key) and reads. A read cursor is corrupted by a write if there is a duplicate key error. Scenario: - write 'A' - write 'A': allocates a record, hits a duplicate key error, leaves the allocated place as "deleted record". - init scan - read: finds 'A' at #0 - read: finds deleted record at #1, properly skips over it, moves to EOF - even if we save the read position at this point, it's "after #1" - close scan - write 'B': takes the place of deleted record, i.e. writes at #1 - write 'C': writes at #2 - init scan, reposition at saved position - read: still after #1, so misses 'B'. In this scenario, the table is formed of real records followed by deleted records and then EOF. 3) To avoid those problems, sub_select() stops reading when it has read the count of real records in the table, thus engines never hit EOF or a deleted record. */ class Recursive_executor { private: SELECT_LEX_UNIT *const unit; THD *const thd; /// Count of executions of recursive members. uint iteration_counter; Strict_error_handler strict_handler; enum_check_fields save_check_for_truncated_fields; sql_mode_t save_sql_mode; bool disabled_trace, pop_handler; /** If recursive, count of rows in the temporary table when the current iteration started. */ ha_rows row_count; TABLE *table; ///< Table for result of union handler *cached_file; ///< 'handler' of 'table' /// Space to store a row position (InnoDB uses 6 bytes, MEMORY uses 16) uchar row_ref[16]; public: Recursive_executor(SELECT_LEX_UNIT *unit_arg, THD *thd_arg) : unit(unit_arg), thd(thd_arg), iteration_counter(0), strict_handler(Strict_error_handler::ENABLE_SET_SELECT_STRICT_ERROR_HANDLER), disabled_trace(false), pop_handler(false), row_count(0), table(nullptr), cached_file(nullptr) { TRASH(row_ref, sizeof(row_ref)); } bool initialize(TABLE *table_arg) { if (!unit->is_recursive()) return false; /* For RECURSIVE, beginners will forget that: - the CTE's column types are defined by the non-recursive member - which implies that recursive member's selected expressions are cast to the non-recursive member's type. That will cause silent truncation and possibly an infinite recursion due to a condition like: 'LENGTH(growing_col) < const', or, 'growing_col < const', which is always satisfied due to truncation. This situation is similar to create table t select "x" as a; insert into t select concat("x",a) from t; which sends ER_DATA_TOO_LONG in strict mode. So we should inform the user. If we only raised warnings: it will not interrupt an infinite recursion, a MAX_RECURSION hint (if we featured one) may interrupt; but then the warnings won't be seen, as the interruption will raise an error. So warnings are useless. Instead, we send a truncation error: it is visible, indicates the source of the problem, and is consistent with the INSERT case above. Usually, truncation in SELECT triggers an error only in strict mode; but if we don't send an error we get a runaway query; and as WITH RECURSIVE is a new feature we don't have to carry the permissiveness of the past, so we send an error even if in non-strict mode. For a non-recursive UNION, truncation shouldn't happen as all UNION members participated in type calculation. */ if (thd->is_strict_mode()) { pop_handler= true; save_check_for_truncated_fields= thd->check_for_truncated_fields; thd->check_for_truncated_fields= CHECK_FIELD_WARN; thd->push_internal_handler(&strict_handler); } for (SELECT_LEX *sl= unit->first_recursive; sl; sl= sl->next_select()) { TABLE_LIST *tl= sl->recursive_reference; DBUG_ASSERT(tl && tl->table && // returns rows in insertion order: tl->table->s->primary_key == MAX_KEY); if (open_tmp_table(tl->table)) return true; /* purecov: inspected */ } unit->got_all_recursive_rows= false; table= table_arg; return false; } /// @returns Query block to execute first, in current phase SELECT_LEX *first_select() const { return (iteration_counter == 0) ? unit->first_select() : unit->first_recursive; } /// @returns Query block to execute last, in current phase SELECT_LEX *last_select() const { return (iteration_counter == 0) ? unit->first_recursive : nullptr; } /// @returns true if more iterations are needed bool more_iterations() { if (!unit->is_recursive()) return false; iteration_counter++; if (iteration_counter == 3) { DEBUG_SYNC(thd, "in_WITH_RECURSIVE"); } ha_rows new_row_count= *unit->query_result()->row_count(); if (row_count == new_row_count) { // nothing new if (unit->got_all_recursive_rows) return false; // The final iteration is done. unit->got_all_recursive_rows= true; // Do a final iteration, just to get table free-ing/unlocking: return true; } row_count= new_row_count; #ifdef OPTIMIZER_TRACE Opt_trace_context &trace= thd->opt_trace; if (iteration_counter == 2 && !trace.feature_enabled(Opt_trace_context::REPEATED_SUBSELECT)) { disabled_trace= true; trace.disable_I_S_for_this_and_children(); } #endif return true; } /** fake_select_lex is going to read rows which appeared since the previous pass. So it needs to re-establish the scan where it had left. */ bool prepare_for_scan() { if (cached_file == nullptr) return false; DBUG_ASSERT(iteration_counter > 0); int error; if (cached_file == table->file) { DBUG_ASSERT(!cached_file->inited); error= cached_file->ha_rnd_init(false); DBUG_ASSERT(!error); error= cached_file->ha_rnd_pos(table->record[0], row_ref); } else { // Since last pass of reads, MEMORY changed to InnoDB: QEP_TAB *qep_tab= table->reginfo.qep_tab; error= reposition_innodb_cursor(table, qep_tab->m_fetched_rows); } DBUG_ASSERT(!error); return error; } /** After fake_select_lex has done a pass of reading 'table', control will soon go to recursive query blocks which may write to 'table', thus we save the read-cursor's position (necessary to re-establish the scan at next pass), then close the cursor (necessary to allow writes). A tidy approach like this is necessary: with a single 'handler', an open read-cursor cannot survive writes (example: in MEMORY, read-cursor position is 'info->current_ptr' (see heap_scan()) and heap_write() changes it). */ bool save_scan_position() { if (!unit->is_recursive()) return false; if (!table->file->inited) { // Scan is not initialized if seed SELECT returned empty result cached_file= nullptr; return false; } cached_file= table->file; cached_file->position(table->record[0]); DBUG_ASSERT(sizeof(row_ref) >= cached_file->ref_length); memcpy(row_ref, cached_file->ref, cached_file->ref_length); int error= cached_file->ha_rnd_end(); DBUG_ASSERT(!error); return error; } ~Recursive_executor() { if (unit->is_recursive()) { #ifdef OPTIMIZER_TRACE if (disabled_trace) thd->opt_trace.restore_I_S(); #endif if (pop_handler) { thd->pop_internal_handler(); thd->check_for_truncated_fields= save_check_for_truncated_fields; } } } }; /** Execute a query expression that may be a UNION and/or have an ordered result. @param thd thread handle @returns false if success, true if error */ bool SELECT_LEX_UNIT::execute(THD *thd) { DBUG_ENTER("SELECT_LEX_UNIT::exec"); DBUG_ASSERT(!is_simple() && is_optimized()); if (is_executed() && !uncacheable) DBUG_RETURN(false); /* Even if we return "true" the statement might continue (e.g. ER_SUBQUERY_1_ROW in stmt with IGNORE), so we want to restore current_select(): */ Change_current_select save_select(thd); if (is_executed()) { for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select()) { if (sl->join->is_executed()) { thd->lex->set_current_select(sl); sl->join->reset(); } if (fake_select_lex != nullptr) { thd->lex->set_current_select(fake_select_lex); fake_select_lex->join->reset(); } } } // Set "executed" state, even though execution may end with an error set_executed(); if (item) { item->reset_value_registration(); if (item->assigned()) { item->assigned(false); // Prepare for re-execution of this unit item->reset(); if (table->is_created()) { table->file->ha_delete_all_rows(); table->file->info(HA_STATUS_VARIABLE); } } // re-enable indexes for next subquery execution if (union_distinct && table->file->ha_enable_indexes(HA_KEY_SWITCH_ALL)) DBUG_RETURN(true); /* purecov: inspected */ } Recursive_executor recursive_executor(this, thd); if (recursive_executor.initialize(table)) DBUG_RETURN(true); /* purecov: inspected */ bool status= false; // Execution error status do { for (auto sl= recursive_executor.first_select(); sl != recursive_executor.last_select(); sl= sl->next_select()) { thd->lex->set_current_select(sl); // Set limit and offset for each execution: if (set_limit(thd, sl)) DBUG_RETURN(true); /* purecov: inspected */ // Execute this query block sl->join->exec(); status= sl->join->error != 0; if (sl == union_distinct && sl->next_select()) { // This is UNION DISTINCT, so there should be a fake_select_lex DBUG_ASSERT(fake_select_lex != NULL); if (table->file->ha_disable_indexes(HA_KEY_SWITCH_ALL)) DBUG_RETURN(true); /* purecov: inspected */ table->no_keyread= 1; } if (status) DBUG_RETURN(true); if (union_result->flush()) DBUG_RETURN(true); /* purecov: inspected */ } if (fake_select_lex != NULL) { thd->lex->set_current_select(fake_select_lex); if (table->hash_field) // Prepare for access method of JOIN::exec table->file->ha_index_or_rnd_end(); if (set_limit(thd, fake_select_lex)) DBUG_RETURN(true); /* purecov: inspected */ JOIN *join= fake_select_lex->join; if (recursive_executor.prepare_for_scan()) DBUG_RETURN(true); /* purecov: inspected */ join->exec(); status= join->error != 0; if (status) DBUG_RETURN(true); if (recursive_executor.save_scan_position()) DBUG_RETURN(true); /* purecov: inspected */ if (table->hash_field) // Prepare for duplicate elimination table->file->ha_index_init(0, false); } } while (recursive_executor.more_iterations()); if (fake_select_lex) { fake_select_lex->table_list.empty(); int error= table->file->info(HA_STATUS_VARIABLE); if (error) { table->file->print_error(error, MYF(0)); /* purecov: inspected */ DBUG_RETURN(true); /* purecov: inspected */ } thd->current_found_rows= (ulonglong)table->file->stats.records; } DBUG_RETURN(status); } /** Cleanup this query expression object after preparation or one round of execution. After the cleanup, the object can be reused for a new round of execution, but a new optimization will be needed before the execution. @return false if previous execution was successful, and true otherwise */ bool SELECT_LEX_UNIT::cleanup(bool full) { DBUG_ENTER("SELECT_LEX_UNIT::cleanup"); DBUG_ASSERT(thd == current_thd); if (cleaned >= (full ? UC_CLEAN : UC_PART_CLEAN)) DBUG_RETURN(false); cleaned= (full ? UC_CLEAN : UC_PART_CLEAN); bool error= false; for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select()) error|= sl->cleanup(full); if (fake_select_lex) { /* Normally done at end of evaluation, but not if there was an error: */ fake_select_lex->table_list.empty(); fake_select_lex->recursive_reference= nullptr; error|= fake_select_lex->cleanup(full); } // fake_select_lex's table depends on Temp_table_param inside union_result if (full && union_result) { union_result->cleanup(); delete union_result; union_result= NULL; // Safety if (table) free_tmp_table(thd, table); table= NULL; // Safety } /* explain_marker is (mostly) a property determined at prepare time and must thus be preserved for the next execution, if this is a prepared statement. */ DBUG_RETURN(error); } #ifndef DBUG_OFF void SELECT_LEX_UNIT::assert_not_fully_clean() { DBUG_ASSERT(cleaned < UC_CLEAN); SELECT_LEX *sl= first_select(); for (;;) { if (!sl) { sl= fake_select_lex; if (!sl) break; } for (SELECT_LEX_UNIT *lex_unit= sl->first_inner_unit(); lex_unit ; lex_unit= lex_unit->next_unit()) lex_unit->assert_not_fully_clean(); if (sl == fake_select_lex) break; else sl= sl->next_select(); } } #endif void SELECT_LEX_UNIT::reinit_exec_mechanism() { prepared= optimized= executed= false; #ifndef DBUG_OFF if (is_union()) { List_iterator_fast<Item> it(item_list); Item *field; while ((field= it++)) { /* we can't cleanup here, because it broke link to temporary table field, but have to drop fixed flag to allow next fix_field of this field during re-executing */ field->fixed= 0; } } #endif } /** Change the query result object used to return the final result of the unit, replacing occurences of old_result with new_result. @param new_result New query result object @param old_result Old query result object @retval false Success @retval true Error */ bool SELECT_LEX_UNIT::change_query_result(Query_result_interceptor *new_result, Query_result_interceptor *old_result) { for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select()) { if (sl->query_result() && sl->change_query_result(new_result, old_result)) return true; /* purecov: inspected */ } set_query_result(new_result); return false; } /** Get column type information for this query expression. For a single query block the column types are taken from the list of selected items of this block. For a union this function assumes that SELECT_LEX_UNIT::prepare() has been called and returns the type holders that were created for unioned column types of all query blocks. @note The implementation of this function should be in sync with SELECT_LEX_UNIT::prepare() @returns List of items as specified in function description */ List<Item> *SELECT_LEX_UNIT::get_unit_column_types() { DBUG_ASSERT(is_prepared()); return is_union() ? &types : &first_select()->item_list; } /** Get field list for this query expression. For a UNION of query blocks, return the field list generated during prepare. For a single query block, return the field list after all possible intermediate query processing steps are done (optimization is complete). @returns List containing fields of the query expression. */ List<Item> *SELECT_LEX_UNIT::get_field_list() { DBUG_ASSERT(is_optimized()); return is_union() ? &types : first_select()->join->fields; } const Query_result *SELECT_LEX_UNIT::recursive_result(SELECT_LEX *reader) const { DBUG_ASSERT(reader->master_unit() == this && reader->is_recursive()); if (reader == fake_select_lex) return union_result; else return m_query_result; } bool SELECT_LEX_UNIT::mixed_union_operators() const { return union_distinct && union_distinct->next_select(); } /** Closes (and, if last reference, drops) temporary tables created to materialize derived tables, schema tables and CTEs. @param thd Thread handler @param list List of tables to search in */ static void destroy_materialized(THD *thd, TABLE_LIST *list) { for (auto tl= list; tl; tl= tl->next_local) { if (tl->merge_underlying_list) { // Find a materialized view inside another view. destroy_materialized(thd, tl->merge_underlying_list); } if (!tl->table) continue; // Not materialized if (tl->is_view_or_derived()) { tl->reset_name_temporary(); if (tl->common_table_expr()) tl->common_table_expr()->tmp_tables.clear(); } else if (!tl->is_recursive_reference() && !tl->schema_table) continue; free_tmp_table(thd, tl->table); } } /** Cleanup after preparation or one round of execution. @return false if previous execution was successful, and true otherwise */ bool SELECT_LEX::cleanup(bool full) { DBUG_ENTER("SELECT_LEX::cleanup()"); bool error= false; if (join) { if (full) { DBUG_ASSERT(join->select_lex == this); error= join->destroy(); delete join; join= NULL; } else join->cleanup(); } THD *const thd= master_unit()->thd; if (full) destroy_materialized(thd, get_table_list()); for (SELECT_LEX_UNIT *lex_unit= first_inner_unit(); lex_unit ; lex_unit= lex_unit->next_unit()) { error|= lex_unit->cleanup(full); } inner_refs_list.empty(); if (full && m_windows.elements > 0) { List_iterator<Window> li(m_windows); Window *w; while ((w= li++)) w->cleanup(thd); } DBUG_RETURN(error); } void SELECT_LEX::cleanup_all_joins() { if (join) join->cleanup(); for (SELECT_LEX_UNIT *unit= first_inner_unit(); unit; unit= unit->next_unit()) { for (SELECT_LEX *sl= unit->first_select(); sl; sl= sl->next_select()) sl->cleanup_all_joins(); } }

sql/sql_union.cc (983 lines of code) (raw):