/* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
/**
@file
@brief
Functions for easy reading of records, possibly through a cache
*/
#include "sql/records.h"
#include <string.h>
#include "field.h"
#include "filesort.h" // filesort_free_buffers
#include "handler.h"
#include "item.h"
#include "my_byteorder.h"
#include "my_dbug.h"
#include "my_pointer_arithmetic.h"
#include "my_sys.h"
#include "my_thread_local.h"
#include "mysql/service_mysql_alloc.h"
#include "opt_range.h" // QUICK_SELECT_I
#include "psi_memory_key.h"
#include "sort_param.h"
#include "sql_class.h" // THD
#include "sql_const.h"
#include "sql_executor.h" // QEP_TAB
#include "sql_sort.h"
#include "sql_string.h"
#include "system_variables.h"
#include "table.h"
#include "thr_lock.h"
#include "varlen_sort.h"
static int rr_quick(READ_RECORD *info);
static int rr_from_tempfile(READ_RECORD *info);
template<bool> static int rr_unpack_from_tempfile(READ_RECORD *info);
template<bool> static int rr_unpack_from_buffer(READ_RECORD *info);
static int rr_from_pointers(READ_RECORD *info);
static int rr_from_cache(READ_RECORD *info);
static int init_rr_cache(THD *thd, READ_RECORD *info);
static int rr_index_first(READ_RECORD *info);
static int rr_index_last(READ_RECORD *info);
static int rr_index(READ_RECORD *info);
static int rr_index_desc(READ_RECORD *info);
/**
Initialize a READ_RECORD structure to perform a full index scan in the
desired direction using the read_record.read_record() interface.
This function was added at a late stage and is used only by
UPDATE/DELETE. Other statements perform index scans using the
join_read_first/next functions.
@param info READ_RECORD structure to initialize.
@param thd Thread handle
@param table Table to be accessed
@param print_error If true, call table->file->print_error() if an error
occurs (except for end-of-records error)
@param idx index to scan
@param reverse Scan in the reverse direction
@retval true error
@retval false success
*/
bool init_read_record_idx(READ_RECORD *info, THD *thd, TABLE *table,
bool print_error, uint idx, bool reverse)
{
int error;
empty_record(table);
memset(info, 0, sizeof(*info));
info->thd= thd;
info->table= table;
info->record= table->record[0];
info->print_error= print_error;
info->unlock_row= rr_unlock_row;
if (!table->file->inited &&
(error= table->file->ha_index_init(idx, 1)))
{
if (print_error)
table->file->print_error(error, MYF(0));
return true;
}
/* read_record will be changed to rr_index in rr_index_first */
info->read_record= reverse ? rr_index_last : rr_index_first;
return false;
}
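/*
  A minimal usage sketch (hypothetical caller, not taken from this file):
  the UPDATE/DELETE code paths drive a reverse full index scan roughly like
  this, assuming 'table' is already opened and 'idx' names a valid key:

    READ_RECORD info;
    if (init_read_record_idx(&info, thd, table, true, idx, true))
      return true;                        // error already reported
    while (info.read_record(&info) == 0)
      process_row(table->record[0]);      // process_row() is hypothetical
    end_read_record(&info);
*/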
/*
init_read_record is used to scan a table using one of several methods.
Which method to use is set up in this call, so that later calls to
info->read_record will invoke the appropriate method through a function
pointer.
Five of the methods deal exclusively with the result of the sort function
filesort; that result is retrieved using read_record calls. The other two
methods are used for normal table access.
The filesort produces references to the sorted records; these references
can be stored in memory or in a temporary file.
The temporary file is normally used when the references don't fit into
a properly sized memory buffer. For most small queries the references
are stored in the memory buffer.
SYNOPSIS
init_read_record()
info OUT read structure
thd Thread handle
table Table the data [originally] comes from; if NULL,
'table' is inferred from 'qep_tab'; if non-NULL, 'qep_tab' must be NULL.
qep_tab QEP_TAB for 'table', if there is one; we may use
qep_tab->quick() as data source
use_record_cache Call file->extra_opt(HA_EXTRA_CACHE,...)
if we're going to do sequential read and some
additional conditions are satisfied.
print_error Copy this to info->print_error
disable_rr_cache Don't use rr_from_cache (used by sort-union
index-merge which produces rowid sequences that
are already ordered)
DESCRIPTION
This function sets up reading data via one of the methods described below.
The temporary file is also used when performing an update where a key is
modified.
Methods used when ref's are in memory (using rr_from_pointers):
rr_unpack_from_buffer:
----------------------
This method is used when table->sort.addon_fields is allocated.
This is allocated for most SELECT queries not involving any BLOBs.
In this case the records are fetched from a memory buffer.
rr_from_pointers:
-----------------
Used when the above does not hold: UPDATE, DELETE and so forth, and
SELECTs involving BLOBs. It is also used when the addon_field
buffer is not allocated because its size would exceed the
session variable max_length_for_sort_data.
In this case the record data is fetched from the handler using the
saved reference via the rnd_pos handler call.
Methods used when ref's are in a temporary file (using rr_from_tempfile)
rr_unpack_from_tempfile:
------------------------
Same as rr_unpack_from_buffer except that the references are fetched from
a temporary file. This should rarely happen outside of unusual
configurations.
rr_from_tempfile:
-----------------
Same as rr_from_pointers except that the references are fetched from a
temporary file instead of from a memory buffer.
rr_from_cache:
--------------
This is a special variant of rr_from_tempfile that can be used for
handlers that do not use the HA_FAST_KEY_READ table flag. Instead
of reading the references one by one from the temporary file it reads
a set of them, sorts them and reads all of the rows into a buffer which
is then used for a number of subsequent calls to rr_from_cache.
It is only used for SELECT queries, subject to a number of additional
conditions on table size.
All other accesses use either index access methods (rr_quick) or a full
table scan (rr_sequential).
rr_quick:
---------
rr_quick uses one of the QUICK_SELECT classes in opt_range.cc to
perform an index scan. There are loads of functionality hidden
in these quick classes. It handles all index scans of various kinds.
rr_sequential:
--------------
This is the most basic access method of a table, using ha_rnd_init,
ha_rnd_next and ha_rnd_end. No indexes are used.
@retval true error
@retval false success
*/
bool init_read_record(READ_RECORD *info,THD *thd,
TABLE *table, QEP_TAB *qep_tab,
int use_record_cache, bool print_error,
bool disable_rr_cache)
{
int error= 0;
IO_CACHE *tempfile;
DBUG_ENTER("init_read_record");
// If only 'table' is given, assume no quick, no condition.
DBUG_ASSERT(!(table && qep_tab));
if (!table)
table= qep_tab->table();
memset(info, 0, sizeof(*info));
info->thd=thd;
info->table=table;
info->forms= &info->table; /* Only one table */
if (table->s->tmp_table == NON_TRANSACTIONAL_TMP_TABLE &&
!table->sort.using_addon_fields())
(void) table->file->extra(HA_EXTRA_MMAP);
if (table->sort.using_addon_fields())
{
info->rec_buf= table->sort.addon_fields->get_addon_buf();
info->ref_length= table->sort.addon_fields->get_addon_buf_length();
}
else
{
empty_record(table);
info->record= table->record[0];
info->ref_length= table->file->ref_length;
}
info->quick= qep_tab ? qep_tab->quick() : NULL;
info->print_error=print_error;
info->unlock_row= rr_unlock_row;
info->ignore_not_found_rows= 0;
// Initialize for a scan over a set of rows
if (info->quick && info->quick->clustered_pk_range())
{
/*
In case of QUICK_INDEX_MERGE_SELECT with a clustered pk range we have to
use its own access method (i.e. QUICK_INDEX_MERGE_SELECT::get_next()), as
the sort file does not contain rowids which satisfy the clustered pk range.
*/
tempfile= 0;
}
else
tempfile= table->sort.io_cache;
if (tempfile && my_b_inited(tempfile)) // Test if ref-records was used
{
if (table->sort.using_addon_fields())
{
DBUG_PRINT("info",("using rr_unpack_from_tempfile"));
if (table->sort.addon_fields->using_packed_addons())
info->read_record= rr_unpack_from_tempfile<true>;
else
info->read_record= rr_unpack_from_tempfile<false>;
}
else
{
DBUG_PRINT("info",("using rr_from_tempfile"));
info->read_record= rr_from_tempfile;
}
info->io_cache=tempfile;
reinit_io_cache(info->io_cache,READ_CACHE,0L,0,0);
info->ref_pos=table->file->ref;
if (!table->file->inited &&
(error= table->file->ha_rnd_init(0)))
goto err;
/*
table->sort.using_addon_fields() is checked because if we use addon
fields, it doesn't make sense to use the cache - we don't read from the
table and table->sort.io_cache is read sequentially
*/
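/*
The remaining conditions below require, in order: a non-zero
read_rnd_buff_size, a handler without HA_FAST_KEY_READ, a read-only table
or a lock type no stronger than TL_READ_NO_INSERT, a table and result set
large enough for the cache to pay off, no BLOB columns, and rowids that
fit within MAX_REFLENGTH bytes.
*/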
if (!disable_rr_cache &&
!table->sort.using_addon_fields() &&
thd->variables.read_rnd_buff_size &&
!(table->file->ha_table_flags() & HA_FAST_KEY_READ) &&
(table->db_stat & HA_READ_ONLY ||
table->reginfo.lock_type <= TL_READ_NO_INSERT) &&
(ulonglong) table->s->reclength* (table->file->stats.records+
table->file->stats.deleted) >
(ulonglong) MIN_FILE_LENGTH_TO_USE_ROW_CACHE &&
info->io_cache->end_of_file/info->ref_length * table->s->reclength >
(my_off_t) MIN_ROWS_TO_USE_TABLE_CACHE &&
!table->s->blob_fields &&
info->ref_length <= MAX_REFLENGTH)
{
if (init_rr_cache(thd, info))
goto skip_caching;
DBUG_PRINT("info",("using rr_from_cache"));
info->read_record=rr_from_cache;
}
}
else if (info->quick)
{
DBUG_PRINT("info",("using rr_quick"));
info->read_record=rr_quick;
}
// See save_index() which stores the filesort result set.
else if (table->sort.has_filesort_result_in_memory())
{
if ((error= table->file->ha_rnd_init(0)))
goto err;
info->cache_pos=table->sort.sorted_result;
if (table->sort.using_addon_fields())
{
DBUG_PRINT("info",("using rr_unpack_from_buffer"));
DBUG_ASSERT(table->sort.sorted_result_in_fsbuf);
info->unpack_counter= 0;
if (table->sort.addon_fields->using_packed_addons())
info->read_record= rr_unpack_from_buffer<true>;
else
info->read_record= rr_unpack_from_buffer<false>;
info->cache_end= table->sort.sorted_result_end;
}
else
{
DBUG_PRINT("info",("using rr_from_pointers"));
info->read_record= rr_from_pointers;
info->cache_end=
info->cache_pos + table->sort.found_records * info->ref_length;
}
}
else
{
DBUG_PRINT("info",("using rr_sequential"));
info->read_record=rr_sequential;
if ((error= table->file->ha_rnd_init(1)))
goto err;
/* We can use record cache if we don't update dynamic length tables */
if (!table->no_cache &&
(use_record_cache > 0 ||
(int) table->reginfo.lock_type <= (int) TL_READ_HIGH_PRIORITY ||
!(table->s->db_options_in_use & HA_OPTION_PACK_RECORD) ||
(use_record_cache < 0 &&
!(table->file->ha_table_flags() & HA_NOT_DELETE_WITH_CACHE))))
(void) table->file->extra_opt(HA_EXTRA_CACHE,
thd->variables.read_buff_size);
}
skip_caching:
/*
Do condition pushdown for UPDATE/DELETE.
TODO: Remove this from here as it causes two condition pushdown calls
when we're running a SELECT and the condition cannot be pushed down.
Some temporary tables do not have a TABLE_LIST object, and there is never
a need to push down conditions (ECP) for such tables.
*/
if (thd->optimizer_switch_flag(OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN) &&
qep_tab && qep_tab->condition() && table->pos_in_table_list &&
(qep_tab->condition()->used_tables() & table->pos_in_table_list->map()) &&
!table->file->pushed_cond)
table->file->cond_push(qep_tab->condition());
DBUG_RETURN(false);
err:
if (print_error)
table->file->print_error(error, MYF(0));
DBUG_RETURN(true);
} /* init_read_record */
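/*
  A minimal usage sketch (hypothetical caller, not taken from this file),
  assuming 'table' is already opened; the state of table->sort and
  'qep_tab' decides which rr_* method is dispatched:

    READ_RECORD info;
    if (init_read_record(&info, thd, table, NULL, 1, true, false))
      return true;                        // error already reported
    while (info.read_record(&info) == 0)
      process_row(table->record[0]);      // process_row() is hypothetical
    end_read_record(&info);
*/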
void end_read_record(READ_RECORD *info)
{ /* free cache if used */
if (info->cache)
{
my_free(info->cache);
info->cache=0;
}
if (info->table && info->table->key_read)
{
info->table->set_keyread(FALSE);
}
if (info->table && info->table->is_created())
{
filesort_free_buffers(info->table,0);
(void) info->table->file->extra(HA_EXTRA_NO_CACHE);
if (info->read_record != rr_quick) // otherwise quick_range does it
(void) info->table->file->ha_index_or_rnd_end();
info->table=0;
}
}
static int rr_handle_error(READ_RECORD *info, int error)
{
if (info->thd->killed)
{
info->thd->send_kill_message();
return 1;
}
if (error == HA_ERR_END_OF_FILE)
error= -1;
else
{
if (info->print_error)
info->table->file->print_error(error, MYF(0));
if (error < 0) // Fix negative BDB errno
error= 1;
}
return error;
}
/** Read a record using the quick (range) select; rows reported as deleted are skipped. */
static int rr_quick(READ_RECORD *info)
{
int tmp;
while ((tmp= info->quick->get_next()))
{
if (info->thd->killed || (tmp != HA_ERR_RECORD_DELETED))
{
tmp= rr_handle_error(info, tmp);
break;
}
}
return tmp;
}
/**
Reads first row in an index scan.
@param info Scan info
@retval
0 Ok
@retval
-1 End of records
@retval
1 Error
*/
static int rr_index_first(READ_RECORD *info)
{
int tmp= info->table->file->ha_index_first(info->record);
info->read_record= rr_index;
if (tmp)
tmp= rr_handle_error(info, tmp);
return tmp;
}
/**
Reads last row in an index scan.
@param info Scan info
@retval
0 Ok
@retval
-1 End of records
@retval
1 Error
*/
static int rr_index_last(READ_RECORD *info)
{
int tmp= info->table->file->ha_index_last(info->record);
info->read_record= rr_index_desc;
if (tmp)
tmp= rr_handle_error(info, tmp);
return tmp;
}
/**
Reads index sequentially after first row.
Read the next index record (in forward direction) and translate return
value.
@param info Scan info
@retval
0 Ok
@retval
-1 End of records
@retval
1 Error
*/
static int rr_index(READ_RECORD *info)
{
int tmp= info->table->file->ha_index_next(info->record);
if (tmp)
tmp= rr_handle_error(info, tmp);
return tmp;
}
/**
Reads index sequentially from the last row to the first.
Read the prev index record (in backward direction) and translate return
value.
@param info Scan info
@retval
0 Ok
@retval
-1 End of records
@retval
1 Error
*/
static int rr_index_desc(READ_RECORD *info)
{
int tmp= info->table->file->ha_index_prev(info->record);
if (tmp)
tmp= rr_handle_error(info, tmp);
return tmp;
}
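/**
Reads the next row of a full table scan (ha_rnd_next), skipping rows that
MyISAM may report as deleted during concurrent, unlocked modifications.
@param info Scan info
@retval
0 Ok
@retval
-1 End of records
@retval
1 Error
*/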
int rr_sequential(READ_RECORD *info)
{
int tmp;
while ((tmp=info->table->file->ha_rnd_next(info->record)))
{
/*
ha_rnd_next can return HA_ERR_RECORD_DELETED for MyISAM when one thread
is reading and another is deleting without locks.
*/
if (info->thd->killed || (tmp != HA_ERR_RECORD_DELETED))
{
tmp= rr_handle_error(info, tmp);
break;
}
}
return tmp;
}
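/**
Reads a row by fetching the next rowid from the sort temporary file and
calling ha_rnd_pos with it.
@param info Scan info
@retval
0 Ok
@retval
-1 End of file
@retval
1 Error
*/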
static int rr_from_tempfile(READ_RECORD *info)
{
int tmp;
for (;;)
{
if (my_b_read(info->io_cache,info->ref_pos,info->ref_length))
return -1; /* End of file */
if (!(tmp=info->table->file->ha_rnd_pos(info->record,info->ref_pos)))
break;
/* The following is extremely unlikely to happen */
if (tmp == HA_ERR_RECORD_DELETED ||
(tmp == HA_ERR_KEY_NOT_FOUND && info->ignore_not_found_rows))
continue;
tmp= rr_handle_error(info, tmp);
break;
}
return tmp;
} /* rr_from_tempfile */
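/**
Unpack an addon-fields record from 'buff' into the fields' positions in the
regular record buffer (field->ptr).
With Packed_addon_fields the values are stored back to back after the null
bits and each value occupies only the bytes Field::unpack() consumes;
otherwise every value sits at its fixed addonf->offset.
@param buff Buffer holding the sorted addon-field record
@tparam Packed_addon_fields Are the addon fields packed?
*/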
template<bool Packed_addon_fields>
inline void Filesort_info::unpack_addon_fields(uchar *buff)
{
Sort_addon_field *addonf= addon_fields->begin();
const uchar *start_of_record= buff + addonf->offset;
for ( ; addonf != addon_fields->end(); ++addonf)
{
Field *field= addonf->field;
if (addonf->null_bit && (addonf->null_bit & buff[addonf->null_offset]))
{
field->set_null();
continue;
}
field->set_notnull();
if (Packed_addon_fields)
start_of_record= field->unpack(field->ptr, start_of_record);
else
field->unpack(field->ptr, buff + addonf->offset);
}
}
/**
Read a result set record from a temporary file after sorting.
The function first reads the next sorted record from the temporary file
into a buffer. On success it unpacks the addon field values used in the
result set from this buffer into their positions in the regular record
buffer.
@param info Reference to the context including record descriptors
@tparam Packed_addon_fields Are the addon fields packed?
This is a compile-time constant, to avoid if (....) tests during execution.
@retval
0 Record successfully read.
@retval
-1 There is no record to be read anymore.
*/
template<bool Packed_addon_fields>
static int rr_unpack_from_tempfile(READ_RECORD *info)
{
uchar *destination= info->rec_buf;
#ifndef DBUG_OFF
my_off_t where= my_b_tell(info->io_cache);
#endif
if (Packed_addon_fields)
{
const uint len_sz= Addon_fields::size_of_length_field;
// First read length of the record.
if (my_b_read(info->io_cache, destination, len_sz))
return -1;
uint res_length= Addon_fields::read_addon_length(destination);
DBUG_PRINT("info", ("rr_unpack from %llu to %p sz %u",
static_cast<ulonglong>(where),
destination, res_length));
DBUG_ASSERT(res_length > len_sz);
DBUG_ASSERT(info->table->sort.using_addon_fields());
// Then read the rest of the record.
if (my_b_read(info->io_cache, destination + len_sz, res_length - len_sz))
return -1; /* purecov: inspected */
}
else
{
if (my_b_read(info->io_cache, destination, info->ref_length))
return -1;
}
info->table->sort.unpack_addon_fields<Packed_addon_fields>(destination);
return 0;
}
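/**
Reads a row by taking the next rowid from the in-memory filesort result
(table->sort.sorted_result) and calling ha_rnd_pos with it.
@param info Scan info
@retval
0 Ok
@retval
-1 End of buffer
@retval
1 Error
*/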
static int rr_from_pointers(READ_RECORD *info)
{
int tmp;
uchar *cache_pos;
for (;;)
{
if (info->cache_pos == info->cache_end)
return -1; /* End of file */
cache_pos= info->cache_pos;
info->cache_pos+= info->ref_length;
if (!(tmp=info->table->file->ha_rnd_pos(info->record,cache_pos)))
break;
/* The following is extremely unlikely to happen */
if (tmp == HA_ERR_RECORD_DELETED ||
(tmp == HA_ERR_KEY_NOT_FOUND && info->ignore_not_found_rows))
continue;
tmp= rr_handle_error(info, tmp);
break;
}
return tmp;
}
/**
Read a result set record from a buffer after sorting.
Get the next record from the filesort buffer,
then unpack the fields into their positions in the regular record buffer.
@param info Reference to the context including record descriptors
@tparam Packed_addon_fields Are the addon fields packed?
This is a compile-time constant, to avoid if (....) tests during execution.
TODO: consider templatizing on is_varlen as well.
Variable / Fixed size key is currently handled by
Filesort_info::get_start_of_payload
@retval
0 Record successfully read.
@retval
-1 There is no record to be read anymore.
*/
template<bool Packed_addon_fields>
static int rr_unpack_from_buffer(READ_RECORD *info)
{
if (info->unpack_counter == info->table->sort.found_records)
return -1; /* End of buffer */
uchar *record= info->table->sort.get_sorted_record(
static_cast<uint>(info->unpack_counter));
uchar *payload= get_start_of_payload(&info->table->sort, record);
info->table->sort.unpack_addon_fields<Packed_addon_fields>(payload);
info->unpack_counter++;
return 0;
}
/* Caching of records from a database */
/**
Initialize caching of records from temporary file.
@retval
0 OK, use caching.
@retval
1 Buffer is too small, or cannot be allocated.
Skip caching, and read records directly from the temporary file.
*/
static int init_rr_cache(THD *thd, READ_RECORD *info)
{
uint rec_cache_size;
DBUG_ENTER("init_rr_cache");
READ_RECORD info_copy= *info;
info->struct_length= 3+MAX_REFLENGTH;
info->reclength= ALIGN_SIZE(info->table->s->reclength+1);
if (info->reclength < info->struct_length)
info->reclength= ALIGN_SIZE(info->struct_length);
info->error_offset= info->table->s->reclength;
info->cache_records= (thd->variables.read_rnd_buff_size /
(info->reclength+info->struct_length));
rec_cache_size= info->cache_records*info->reclength;
info->rec_cache_size= info->cache_records*info->ref_length;
if (info->cache_records <= 2 ||
!(info->cache=(uchar*) my_malloc(key_memory_READ_RECORD_cache,
rec_cache_size+info->cache_records*
info->struct_length,
MYF(0))))
{
*info= info_copy;
DBUG_RETURN(1);
}
DBUG_PRINT("info",("Allocated buffert for %d records",info->cache_records));
info->read_positions=info->cache+rec_cache_size;
info->cache_pos=info->cache_end=info->cache;
DBUG_RETURN(0);
} /* init_rr_cache */
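/**
Reads a row through the row cache set up by init_rr_cache.
The buffer layout (derived from init_rr_cache above) is:
- info->cache: cache_records * reclength bytes of row data; byte
[error_offset] of each slot flags an error, in which case the handler
error code is stored at the start of the slot.
- info->read_positions: cache_records * struct_length bytes, each entry
holding a MAX_REFLENGTH-byte rowid followed by a 3-byte index back into
the row-data area.
@retval
0 Ok (row copied into info->record)
@retval
-1 End of file
@retval
>0 Error reported by the handler for this row
*/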
static int rr_from_cache(READ_RECORD *info)
{
uint i;
ulong length;
my_off_t rest_of_file;
int16 error;
uchar *position,*ref_position,*record_pos;
ulong record;
for (;;)
{
if (info->cache_pos != info->cache_end)
{
if (info->cache_pos[info->error_offset])
{
shortget(&error, info->cache_pos);
if (info->print_error)
info->table->file->print_error(error,MYF(0));
}
else
{
error=0;
memcpy(info->record,info->cache_pos,
(size_t) info->table->s->reclength);
}
info->cache_pos+=info->reclength;
return ((int) error);
}
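/*
The cache is empty: read the next block of rowids from the temporary file,
sort the block by rowid, fetch the corresponding rows into the cache in
that order, and then serve them back in the original (sorted-result) order
via the 3-byte back-index stored next to each rowid.
*/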
length=info->rec_cache_size;
rest_of_file=info->io_cache->end_of_file - my_b_tell(info->io_cache);
if ((my_off_t) length > rest_of_file)
length= (ulong) rest_of_file;
if (!length || my_b_read(info->io_cache,info->cache,length))
{
DBUG_PRINT("info",("Found end of file"));
return -1; /* End of file */
}
length/=info->ref_length;
position=info->cache;
ref_position=info->read_positions;
for (i=0 ; i < length ; i++,position+=info->ref_length)
{
memcpy(ref_position,position,(size_t) info->ref_length);
ref_position+=MAX_REFLENGTH;
int3store(ref_position,(long) i);
ref_position+=3;
}
size_t ref_length= info->ref_length;
DBUG_ASSERT(ref_length <= MAX_REFLENGTH);
varlen_sort(info->read_positions,
info->read_positions + length * info->struct_length,
info->struct_length,
[ref_length](const uchar *a, const uchar *b)
{
return memcmp(a, b, ref_length) < 0;
});
position=info->read_positions;
for (i=0 ; i < length ; i++)
{
memcpy(info->ref_pos,position,(size_t) info->ref_length);
position+=MAX_REFLENGTH;
record=uint3korr(position);
position+=3;
record_pos=info->cache+record*info->reclength;
error= (int16) info->table->file->ha_rnd_pos(record_pos, info->ref_pos);
if (error)
{
record_pos[info->error_offset]=1;
shortstore(record_pos,error);
DBUG_PRINT("error",("Got error: %d:%d when reading row",
my_errno(), (int) error));
}
else
record_pos[info->error_offset]=0;
}
info->cache_end=(info->cache_pos=info->cache)+length*info->reclength;
}
} /* rr_from_cache */
/**
The default implementation of the unlock-row method of READ_RECORD,
used by all access methods.
*/
void rr_unlock_row(QEP_TAB *tab)
{
READ_RECORD *info= &tab->read_record;
info->table->file->unlock_row();
}