in storage/innobase/row/row0sel.cc [1327:2043]
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_sel(
/*====*/
sel_node_t* node, /*!< in: select node */
que_thr_t* thr) /*!< in: query thread */
{
dict_index_t* index;
plan_t* plan;
mtr_t mtr;
ibool moved;
rec_t* rec;
rec_t* old_vers;
rec_t* clust_rec;
ibool search_latch_locked;
ibool consistent_read;
/* The following flag becomes TRUE when we are doing a
consistent read from a non-clustered index and we must look
at the clustered index to find out the previous delete mark
state of the non-clustered record: */
ibool cons_read_requires_clust_rec = FALSE;
ulint cost_counter = 0;
ibool cursor_just_opened;
ibool must_go_to_next;
ibool mtr_has_extra_clust_latch = FALSE;
/* TRUE if the search was made using
a non-clustered index, and we had to
access the clustered record: now &mtr
contains a clustered index latch, and
&mtr must be committed before we move
to the next non-clustered record */
ulint found_flag;
dberr_t err;
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
rec_offs_init(offsets_);
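/* Note: offsets_ above is a stack-based scratch buffer for
rec_get_offsets(); a memory heap is allocated lazily only if a
record has more fields than fit in the buffer. */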
ut_ad(thr->run_node == node);
search_latch_locked = FALSE;
if (node->read_view) {
/* In consistent reads, we try to make do with the adaptive hash
index and avoid the buffer page get. This reduces the memory bus
load resulting from semaphore operations. The search latch will
be s-locked when we access an index with a unique search
condition, but not when we access an index with a less selective
search condition. */
consistent_read = TRUE;
} else {
consistent_read = FALSE;
}
table_loop:
/* TABLE LOOP
----------
This is the outer major loop in calculating a join. We come here when
node->fetch_table changes, and after adding a row to aggregate totals
and, of course, when this function is called. */
ut_ad(mtr_has_extra_clust_latch == FALSE);
plan = sel_node_get_nth_plan(node, node->fetch_table);
index = plan->index;
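/* If rows remain in the prefetch buffer for this table, serve the
next one from there without accessing the index. */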
if (plan->n_rows_prefetched > 0) {
sel_dequeue_prefetched_row(plan);
goto next_table_no_mtr;
}
if (plan->cursor_at_end) {
/* The cursor has already reached the result set end: there are
no more rows to process for this table cursor, and the prefetch
stack was empty as well */
ut_ad(plan->pcur_is_open);
goto table_exhausted_no_mtr;
}
/* Open a cursor to index, or restore an open cursor position */
mtr_start_trx(&mtr, thr_get_trx(thr));
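/* If this is a consistent read with a unique equality search
condition, and we neither need the clustered record separately
nor expect big rows, try to fetch the row directly through the
adaptive hash index. */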
if (consistent_read && plan->unique_search && !plan->pcur_is_open
&& !plan->must_get_clust
&& !plan->table->big_rows) {
if (!search_latch_locked) {
rw_lock_s_lock(&btr_search_latch);
search_latch_locked = TRUE;
} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
/* There is an x-latch request waiting: release the
s-latch for a moment; as an s-latch here is often
held across some 10 searches before being released,
a waiting x-latch request would block other threads
from acquiring an s-latch for a long time,
significantly lowering performance on multiprocessors. */
rw_lock_s_unlock(&btr_search_latch);
rw_lock_s_lock(&btr_search_latch);
}
found_flag = row_sel_try_search_shortcut(node, plan,
search_latch_locked,
&mtr);
if (found_flag == SEL_FOUND) {
goto next_table;
} else if (found_flag == SEL_EXHAUSTED) {
goto table_exhausted;
}
ut_ad(found_flag == SEL_RETRY);
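/* The shortcut did not succeed: reset the cursor and restart the
mini-transaction to fall back to a normal index search below. */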
plan_reset_cursor(plan);
mtr_commit(&mtr);
mtr_start_trx(&mtr, thr_get_trx(thr));
}
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
search_latch_locked = FALSE;
}
if (!plan->pcur_is_open) {
/* Evaluate the expressions to build the search tuple and
open the cursor */
row_sel_open_pcur(plan, search_latch_locked, &mtr);
cursor_just_opened = TRUE;
/* A new search was made: increment the cost counter */
cost_counter++;
} else {
/* Restore pcur position to the index */
must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
cursor_just_opened = FALSE;
if (must_go_to_next) {
/* We have already processed the cursor record: move
to the next */
goto next_rec;
}
}
rec_loop:
/* RECORD LOOP
-----------
In this loop we use pcur and try to fetch a qualifying row, and
also fill the prefetch buffer for this table if n_rows_fetched has
exceeded a threshold. While we are inside this loop, the following
holds:
(1) &mtr is started,
(2) pcur is positioned and open.
NOTE that if cursor_just_opened is TRUE here, it means that we came
to this point right after row_sel_open_pcur. */
ut_ad(mtr_has_extra_clust_latch == FALSE);
rec = btr_pcur_get_rec(&(plan->pcur));
/* PHASE 1: Set a lock if specified */
if (!node->asc && cursor_just_opened
&& !page_rec_is_supremum(rec)) {
/* When we open a cursor for a descending search, we must set
a next-key lock on the successor record: otherwise it would
be possible to insert new records next to the cursor position,
and it might be that these new records should appear in the
search result set, resulting in the phantom problem. */
if (!consistent_read) {
/* If innodb_locks_unsafe_for_binlog option is used
or this session is using READ COMMITTED isolation
level, we lock only the record, i.e., next-key
locking is not used. */
rec_t* next_rec = page_rec_get_next(rec);
ulint lock_type;
trx_t* trx;
trx = thr_get_trx(thr);
offsets = rec_get_offsets(next_rec, index, offsets,
ULINT_UNDEFINED, &heap);
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level
<= TRX_ISO_READ_COMMITTED) {
if (page_rec_is_supremum(next_rec)) {
goto skip_lock;
}
lock_type = LOCK_REC_NOT_GAP;
} else {
lock_type = LOCK_ORDINARY;
}
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
next_rec, index, offsets,
node->row_lock_mode,
LOCK_X_REGULAR,
lock_type, thr);
switch (err) {
case DB_SUCCESS_LOCKED_REC:
err = DB_SUCCESS;
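/* fall through */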
case DB_SUCCESS:
break;
default:
/* Note that in this case we will store in pcur
the PREDECESSOR of the record whose lock we are
waiting for */
goto lock_wait_or_error;
}
}
}
skip_lock:
if (page_rec_is_infimum(rec)) {
/* The infimum record on a page cannot be in the result set,
and neither can a record lock be placed on it: we skip such
a record. We also increment the cost counter, as we may have
processed yet another page of the index. */
cost_counter++;
goto next_rec;
}
if (!consistent_read) {
/* Try to place a lock on the index record */
/* If innodb_locks_unsafe_for_binlog option is used
or this session is using READ COMMITTED isolation level,
we lock only the record, i.e., next-key locking is
not used. */
ulint lock_type;
trx_t* trx;
offsets = rec_get_offsets(rec, index, offsets,
ULINT_UNDEFINED, &heap);
trx = thr_get_trx(thr);
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
if (page_rec_is_supremum(rec)) {
goto next_rec;
}
lock_type = LOCK_REC_NOT_GAP;
} else {
lock_type = LOCK_ORDINARY;
}
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
rec, index, offsets,
node->row_lock_mode,
LOCK_X_REGULAR,
lock_type, thr);
switch (err) {
case DB_SUCCESS_LOCKED_REC:
err = DB_SUCCESS;
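/* fall through */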
case DB_SUCCESS:
break;
default:
goto lock_wait_or_error;
}
}
if (page_rec_is_supremum(rec)) {
/* A page supremum record cannot be in the result set: skip
it now that we have placed a possible lock on it */
goto next_rec;
}
ut_ad(page_rec_is_user_rec(rec));
if (cost_counter > SEL_COST_LIMIT) {
/* Now that we have placed the necessary locks, we can stop
for a while and store the cursor position; NOTE that if we
stored the cursor position BEFORE placing a record lock, the
cursor could jump over some records that another transaction
inserts adjacent to it in the meantime: this would result in
the phantom problem. */
goto stop_for_a_while;
}
/* PHASE 2: Check a mixed index mix id if needed */
if (plan->unique_search && cursor_just_opened) {
ut_ad(plan->mode == PAGE_CUR_GE);
/* As the cursor is now placed on a user record after a search
with the mode PAGE_CUR_GE, the up_match field in the cursor
tells how many fields in the user record matched the search
tuple */
if (btr_pcur_get_up_match(&(plan->pcur))
< plan->n_exact_match) {
goto table_exhausted;
}
/* Ok, no need to test end_conds or mix id */
}
/* We are ready to look at a possible new index entry in the result
set: the cursor is now placed on a user record */
/* PHASE 3: Get previous version in a consistent read */
cons_read_requires_clust_rec = FALSE;
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
if (consistent_read) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (dict_index_is_clust(index)) {
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
node->read_view)) {
err = row_sel_build_prev_vers(
node->read_view, index, rec,
&offsets, &heap, &plan->old_vers_heap,
&old_vers, &mtr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
if (old_vers == NULL) {
/* The record does not exist
in our read view. Skip it, but
first attempt to determine
whether the index segment we
are searching through has been
exhausted. */
offsets = rec_get_offsets(
rec, index, offsets,
ULINT_UNDEFINED, &heap);
/* Fetch the columns needed in
test conditions. The clustered
index record is protected by a
page latch that was acquired
by row_sel_open_pcur() or
row_sel_restore_pcur_pos().
The latch will not be released
until mtr_commit(mtr). */
row_sel_fetch_columns(
index, rec, offsets,
UT_LIST_GET_FIRST(
plan->columns));
if (!row_sel_test_end_conds(plan)) {
goto table_exhausted;
}
goto next_rec;
}
rec = old_vers;
}
} else if (!lock_sec_rec_cons_read_sees(rec,
node->read_view)) {
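/* The secondary index record may not be visible in our
read view: we must look up the clustered index record in
PHASE 5 to find the correct version, if any. */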
cons_read_requires_clust_rec = TRUE;
}
}
/* PHASE 4: Test search end conditions and deleted flag */
/* Fetch the columns needed in test conditions. The record is
protected by a page latch that was acquired by
row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
will not be released until mtr_commit(mtr). */
row_sel_fetch_columns(index, rec, offsets,
UT_LIST_GET_FIRST(plan->columns));
/* Test the selection end conditions: these can only contain
columns which are already found in the index, even though the
index might be non-clustered */
if (plan->unique_search && cursor_just_opened) {
/* No test necessary: the test was already made above */
} else if (!row_sel_test_end_conds(plan)) {
goto table_exhausted;
}
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
&& !cons_read_requires_clust_rec) {
/* The record is delete marked: we can skip it if this is
not a consistent read which might see an earlier version
of a non-clustered index record */
if (plan->unique_search) {
goto table_exhausted;
}
goto next_rec;
}
/* PHASE 5: Get the clustered index record, if needed and if we did
not do the search using the clustered index */
if (plan->must_get_clust || cons_read_requires_clust_rec) {
/* The search used a non-clustered index, so we must also
fetch the clustered index record */
err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
&mtr);
mtr_has_extra_clust_latch = TRUE;
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
/* Retrieving the clustered record required a search:
increment the cost counter */
cost_counter++;
if (clust_rec == NULL) {
/* The record did not exist in the read view */
ut_ad(consistent_read);
goto next_rec;
}
if (rec_get_deleted_flag(clust_rec,
dict_table_is_comp(plan->table))) {
/* The record is delete marked: we can skip it */
goto next_rec;
}
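/* If the cursor may later be used to update the row, store
the position of the clustered index record. */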
if (node->can_get_updated) {
btr_pcur_store_position(&(plan->clust_pcur), &mtr);
}
}
/* PHASE 6: Test the rest of search conditions */
if (!row_sel_test_other_conds(plan)) {
if (plan->unique_search) {
goto table_exhausted;
}
goto next_rec;
}
/* PHASE 7: We found a new qualifying row for the current table; push
the row if prefetch is on, or move to the next table in the join */
plan->n_rows_fetched++;
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
|| plan->unique_search || plan->no_prefetch
|| plan->table->big_rows) {
/* No prefetch in operation: go to the next table */
goto next_table;
}
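/* Prefetch is in operation: push the row to the prefetch buffer
instead of processing it immediately. */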
sel_enqueue_prefetched_row(plan);
if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
/* The prefetch buffer is now full */
sel_dequeue_prefetched_row(plan);
goto next_table;
}
next_rec:
ut_ad(!search_latch_locked);
if (mtr_has_extra_clust_latch) {
/* We must commit &mtr if we are moving to the next
non-clustered index record, because we could break the
latching order if we accessed a different clustered
index page right away without releasing the previous one. */
goto commit_mtr_for_a_while;
}
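/* Advance the cursor in the direction of the scan. */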
if (node->asc) {
moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
} else {
moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
}
if (!moved) {
goto table_exhausted;
}
cursor_just_opened = FALSE;
/* END OF RECORD LOOP
------------------ */
goto rec_loop;
next_table:
/* We found a record which satisfies the conditions: we can move to
the next table or return a row in the result set */
ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
if (plan->unique_search && !node->can_get_updated) {
plan->cursor_at_end = TRUE;
} else {
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = TRUE;
btr_pcur_store_position(&(plan->pcur), &mtr);
}
mtr_commit(&mtr);
mtr_has_extra_clust_latch = FALSE;
next_table_no_mtr:
/* If we use 'goto' to this label, it means that the row was popped
from the prefetched rows stack, and &mtr is already committed */
if (node->fetch_table + 1 == node->n_tables) {
sel_eval_select_list(node);
if (node->is_aggregate) {
goto table_loop;
}
sel_assign_into_var_values(node->into_list, node);
thr->run_node = que_node_get_parent(node);
err = DB_SUCCESS;
goto func_exit;
}
node->fetch_table++;
/* When we move to the next table, we first reset the plan cursor:
we do not care about resetting it when we backtrack from a table */
plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
goto table_loop;
table_exhausted:
/* The table cursor pcur reached the result set end: backtrack to the
previous table in the join if we do not have cached prefetched rows */
plan->cursor_at_end = TRUE;
mtr_commit(&mtr);
mtr_has_extra_clust_latch = FALSE;
if (plan->n_rows_prefetched > 0) {
/* The table became exhausted during a prefetch */
sel_dequeue_prefetched_row(plan);
goto next_table_no_mtr;
}
table_exhausted_no_mtr:
if (node->fetch_table == 0) {
err = DB_SUCCESS;
if (node->is_aggregate && !node->aggregate_already_fetched) {
node->aggregate_already_fetched = TRUE;
sel_assign_into_var_values(node->into_list, node);
thr->run_node = que_node_get_parent(node);
} else {
node->state = SEL_NODE_NO_MORE_ROWS;
thr->run_node = que_node_get_parent(node);
}
goto func_exit;
}
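/* Backtrack to the previous table in the join. */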
node->fetch_table--;
goto table_loop;
stop_for_a_while:
/* Return control for a while to que_run_threads, so that runaway
queries can be canceled. NOTE that when we come here, we must, in
a locking read, have placed the necessary record lock (possibly a
waiting lock request) on the cursor record or its successor: when
we reposition the cursor, this lock guarantees that nobody can
meanwhile have inserted new records which should have appeared in
the result set, which would result in the phantom problem. */
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_commit(&mtr);
#ifdef UNIV_SYNC_DEBUG
ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */
err = DB_SUCCESS;
goto func_exit;
commit_mtr_for_a_while:
/* Stores the cursor position and commits &mtr; this is used if
&mtr may contain latches which would break the latching order if
&mtr were not committed and the latches released. */
plan->stored_cursor_rec_processed = TRUE;
ut_ad(!search_latch_locked);
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_commit(&mtr);
mtr_has_extra_clust_latch = FALSE;
#ifdef UNIV_SYNC_DEBUG
ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */
goto table_loop;
lock_wait_or_error:
/* See the note at stop_for_a_while: the same holds for this case */
ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_commit(&mtr);
#ifdef UNIV_SYNC_DEBUG
ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */
func_exit:
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
}
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
return(err);
}