uint fast_alter_partition_table()

in sql/sql_partition.cc [6925:7292]


/**
  Perform an ALTER TABLE partition change through the partition-specific
  ("fast") path driven by the DDL log and a shadow frm file.

  The function fills in an ALTER_PARTITION_PARAM_TYPE descriptor from its
  arguments and then runs exactly one of four step sequences, selected by
  the handler's alter_table_flags() and by alter_info->flags:
    - the handler reports HA_PARTITION_ONE_PHASE;
    - ALTER_DROP_PARTITION is set;
    - ALTER_ADD_PARTITION is set and the table is RANGE or LIST partitioned;
    - everything else (see the comment in the final branch).

  The last three sequences are each written as one long short-circuit '||'
  chain: a step that evaluates to true is treated as a failure, stops the
  chain and enters the error block. Entries of the form
  "(flag= VALUE, FALSE)" or "(call(...), FALSE)" are comma expressions that
  always yield FALSE: they record progress (action_completed, frm_install)
  or run a call whose result is deliberately ignored, without ever stopping
  the chain. The order of the entries is therefore significant and must not
  be rearranged casually.

  @param thd            Thread handle; stored in the descriptor and used for
                        binlog writing and the final MDL downgrade.
  @param table          The opened table being altered. Must have
                        m_needs_reopen set (asserted in debug builds).
  @param alter_info     ALTER TABLE description; its flags select the branch.
  @param create_info    Create info; table_options and row_type are read to
                        build the descriptor's db_options.
  @param table_list     Table list element, stored in the descriptor and
                        passed on to fast_end_partition().
  @param db             Database name, stored in the descriptor.
  @param table_name     Table name, stored in the descriptor.
  @param new_part_info  New partitioning definition, stored in the
                        descriptor; its part_type is examined for ADD.

  @return The result of fast_end_partition() when the chosen sequence
          completes, or TRUE if any step failed.
*/
uint fast_alter_partition_table(THD *thd, TABLE *table,
                                Alter_info *alter_info,
                                HA_CREATE_INFO *create_info,
                                TABLE_LIST *table_list,
                                char *db,
                                const char *table_name,
                                partition_info *new_part_info)
{
  /* Set-up struct used to write frm files */
  partition_info *part_info;
  ALTER_PARTITION_PARAM_TYPE lpt_obj;
  ALTER_PARTITION_PARAM_TYPE *lpt= &lpt_obj;
  /* Set once the DDL log entry that commits to the change has been written */
  bool action_completed= FALSE;
  /* TRUE only while the shadow frm installation is in progress */
  bool frm_install= FALSE;
  /*
    Remember the ticket now; it is needed for the MDL downgrade on both exit
    paths. NOTE(review): presumably saved up front because the TABLE object
    may no longer be usable after alter_close_table() - confirm.
  */
  MDL_ticket *mdl_ticket= table->mdl_ticket;
  DBUG_ENTER("fast_alter_partition_table");
  DBUG_ASSERT(table->m_needs_reopen);

  part_info= new_part_info;
  lpt->thd= thd;
  lpt->table_list= table_list;
  lpt->part_info= part_info;
  lpt->alter_info= alter_info;
  lpt->create_info= create_info;
  lpt->db_options= create_info->table_options;
  if (create_info->row_type == ROW_TYPE_DYNAMIC)
    lpt->db_options|= HA_OPTION_PACK_RECORD;
  lpt->table= table;
  lpt->key_info_buffer= 0;
  lpt->key_count= 0;
  lpt->db= db;
  lpt->table_name= table_name;
  lpt->copied= 0;
  lpt->deleted= 0;
  lpt->pack_frm_data= NULL;
  lpt->pack_frm_len= 0;

  if (table->file->alter_table_flags(alter_info->flags) &
        HA_PARTITION_ONE_PHASE)
  {
    /*
      In the case where the engine supports one phase online partition
      changes it is not necessary to have any exclusive locks. The
      correctness is upheld instead by transactions being aborted if they
      access the table after its partition definition has changed (if they
      are still using the old partition definition).

      The handler is in this case responsible to ensure that all users
      start using the new frm file after it has changed. To implement
      one phase it is necessary for the handler to have the master copy
      of the frm file and use discovery mechanisms to renew it. Thus
      write frm will write the frm, pack the new frm and finally
      the frm is deleted and the discovery mechanisms will either restore
      back to the old or install the new after the change is activated.

      Thus all open tables will be discovered to be old, if not
      earlier then as soon as they try an operation using the old table.
      One should ensure that this is checked already when opening a table,
      even if it is found in the cache of open tables.

      change_partitions will perform all operations and it is the duty of
      the handler to ensure that the frm files in the system get updated
      in sync with the changes made and, if an error occurs, that proper
      error handling is done.

      If the MySQL Server crashes at this moment but the handler succeeds
      in performing the change then the binlog is not written for the
      change. There is no way to solve this as long as the binlog is not
      transactional and even then it is hard to solve it completely.

      The first approach here was to downgrade locks. Now a different approach
      is decided upon. The idea is that the handler will have access to the
      Alter_info when store_lock arrives with TL_WRITE_ALLOW_READ. So if the
      handler knows that this functionality can be handled with a lower lock
      level it will set the lock level to TL_WRITE_ALLOW_WRITE immediately.
      Thus the need to downgrade the lock disappears.
      1) Write the new frm, pack it and then delete it
      2) Perform the change within the handler

      NOTE(review): unlike the other branches, a failure here jumps straight
      to err without calling handle_alter_part_error(). No DDL log entries
      are written in this branch, which may be why - confirm.
    */
    if (mysql_write_frm(lpt, WFRM_WRITE_SHADOW | WFRM_PACK_FRM) ||
        mysql_change_partitions(lpt))
    {
      goto err;
    }
  }
  else if (alter_info->flags & Alter_info::ALTER_DROP_PARTITION)
  {
    /*
      Now after all checks and setting state on dropped partitions we can
      start the actual dropping of the partitions.

      Drop partition is actually two things happening. The first is that
      a lot of records are deleted. The second is that the behaviour of
      subsequent updates and writes and deletes will change. The delete
      part can be handled without any particular high lock level by
      transactional engines whereas non-transactional engines need to
      ensure that this change is done with an exclusive lock on the table.
      The second part, the change of partitioning does however require
      an exclusive lock to install the new partitioning as one atomic
      operation. If this is not the case, it is possible for two
      transactions to see the change in a different order than their
      serialisation order. Thus we need an exclusive lock for both
      transactional and non-transactional engines.

      For LIST partitions it could be possible to avoid the exclusive lock
      (and for RANGE partitions if they didn't rearrange range definitions
      after a DROP PARTITION) if one ensured that failed accesses to the
      dropped partitions were aborted for sure (thus only possible for
      transactional engines).

      0) Write an entry that removes the shadow frm file if crash occurs
      1) Write the new frm file as a shadow frm
      2) Get an exclusive metadata lock on the table (waits for all active
         transactions using this table). This ensures that we
         can release all other locks on the table and since no one can open
         the table, there can be no new threads accessing the table. They
         will be hanging on this exclusive lock.
      3) Write the ddl log to ensure that the operation is completed
         even in the presence of a MySQL Server crash (the log is executed
         before any other threads are started, so there are no locking issues).
      4) Close the table instances that have already been opened but didn't
         stumble on the abort locked previously. This is done as part of the
         alter_close_table call.
      5) Write the bin log
         Unfortunately the writing of the binlog is not synchronised with
         other logging activities. So no matter in which order the binlog
         is written compared to other activities there will always be cases
         where crashes make strange things occur. In this placement it can
         happen that the ALTER TABLE DROP PARTITION gets performed in the
         master but not in the slaves if we have a crash, after writing the
         ddl log but before writing the binlog. A solution to this would
         require writing the statement first in the ddl log and then
         when recovering from the crash read the binlog and insert it into
         the binlog if not written already.
      6) Install the previously written shadow frm file
      7) Prepare handlers for drop of partitions
      8) Drop the partitions
      9) Remove entries from ddl log
      10) Reopen table if under lock tables
      11) Complete query

      We insert Error injections at all places where it could be interesting
      to test if recovery is properly done.

      In the chain below, action_completed is set right after step 3 (the
      ddl log entry for the drop), and frm_install is TRUE only around
      step 6. Both are passed to handle_alter_part_error() on failure.
      The binlog write in step 5 has its result discarded and is skipped
      when thd->lex->no_write_to_binlog is set.
    */
    if (write_log_drop_shadow_frm(lpt) ||
        ERROR_INJECT_CRASH("crash_drop_partition_1") ||
        ERROR_INJECT_ERROR("fail_drop_partition_1") ||
        mysql_write_frm(lpt, WFRM_WRITE_SHADOW) ||
        ERROR_INJECT_CRASH("crash_drop_partition_2") ||
        ERROR_INJECT_ERROR("fail_drop_partition_2") ||
        wait_while_table_is_used(thd, table, HA_EXTRA_FORCE_REOPEN) ||
        ERROR_INJECT_CRASH("crash_drop_partition_3") ||
        ERROR_INJECT_ERROR("fail_drop_partition_3") ||
        write_log_drop_partition(lpt) ||
        (action_completed= TRUE, FALSE) ||
        ERROR_INJECT_CRASH("crash_drop_partition_4") ||
        ERROR_INJECT_ERROR("fail_drop_partition_4") ||
        alter_close_table(lpt) ||
        ERROR_INJECT_CRASH("crash_drop_partition_5") ||
        ERROR_INJECT_ERROR("fail_drop_partition_5") ||
        ((!thd->lex->no_write_to_binlog) &&
         (write_bin_log(thd, FALSE,
                        thd->query(), thd->query_length()), FALSE)) ||
        ERROR_INJECT_CRASH("crash_drop_partition_6") ||
        ERROR_INJECT_ERROR("fail_drop_partition_6") ||
        (frm_install= TRUE, FALSE) ||
        mysql_write_frm(lpt, WFRM_INSTALL_SHADOW) ||
        (frm_install= FALSE, FALSE) ||
        ERROR_INJECT_CRASH("crash_drop_partition_7") ||
        ERROR_INJECT_ERROR("fail_drop_partition_7") ||
        mysql_drop_partitions(lpt) ||
        ERROR_INJECT_CRASH("crash_drop_partition_8") ||
        ERROR_INJECT_ERROR("fail_drop_partition_8") ||
        (write_log_completed(lpt, FALSE), FALSE) ||
        ERROR_INJECT_CRASH("crash_drop_partition_9") ||
        ERROR_INJECT_ERROR("fail_drop_partition_9") ||
        (alter_partition_lock_handling(lpt), FALSE))
    {
      /* Third argument TRUE marks this as the DROP PARTITION sequence */
      handle_alter_part_error(lpt, action_completed, TRUE, frm_install);
      goto err;
    }
  }
  else if ((alter_info->flags & Alter_info::ALTER_ADD_PARTITION) &&
           (part_info->part_type == RANGE_PARTITION ||
            part_info->part_type == LIST_PARTITION))
  {
    /*
      ADD RANGE/LIST PARTITIONS
      In this case there are no tuples removed and no tuples are added.
      Thus the operation is merely adding a new partition. Thus it is
      necessary to perform the change as an atomic operation. Otherwise
      someone reading without seeing the new partition could potentially
      miss updates made by a transaction serialised before it that are
      inserted into the new partition.

      0) Write an entry that removes the shadow frm file if crash occurs
      1) Write the new frm file as a shadow frm file
      2) Get an exclusive metadata lock on the table (waits for all active
         transactions using this table). This ensures that we
         can release all other locks on the table and since no one can open
         the table, there can be no new threads accessing the table. They
         will be hanging on this exclusive lock.
      3) Write an entry to remove the new partitions if crash occurs
      4) Add the new partitions.
      5) Close all instances of the table and remove them from the table cache.
      6) Write binlog
      7) Now the change is completed except for the installation of the
         new frm file. We thus write an action in the log to change to
         the shadow frm file
      8) Install the new frm file of the table where the partitions are
         added to the table.
      9) Remove entries from ddl log
      10)Reopen tables if under lock tables
      11)Complete query

      In the chain below, action_completed is set right after step 7
      (write_log_rename_frm), and frm_install is TRUE only around step 8.
    */
    if (write_log_drop_shadow_frm(lpt) ||
        ERROR_INJECT_CRASH("crash_add_partition_1") ||
        ERROR_INJECT_ERROR("fail_add_partition_1") ||
        mysql_write_frm(lpt, WFRM_WRITE_SHADOW) ||
        ERROR_INJECT_CRASH("crash_add_partition_2") ||
        ERROR_INJECT_ERROR("fail_add_partition_2") ||
        wait_while_table_is_used(thd, table, HA_EXTRA_FORCE_REOPEN) ||
        ERROR_INJECT_CRASH("crash_add_partition_3") ||
        ERROR_INJECT_ERROR("fail_add_partition_3") ||
        write_log_add_change_partition(lpt) ||
        ERROR_INJECT_CRASH("crash_add_partition_4") ||
        ERROR_INJECT_ERROR("fail_add_partition_4") ||
        mysql_change_partitions(lpt) ||
        ERROR_INJECT_CRASH("crash_add_partition_5") ||
        ERROR_INJECT_ERROR("fail_add_partition_5") ||
        alter_close_table(lpt) ||
        ERROR_INJECT_CRASH("crash_add_partition_6") ||
        ERROR_INJECT_ERROR("fail_add_partition_6") ||
        ((!thd->lex->no_write_to_binlog) &&
         (write_bin_log(thd, FALSE,
                        thd->query(), thd->query_length()), FALSE)) ||
        ERROR_INJECT_CRASH("crash_add_partition_7") ||
        ERROR_INJECT_ERROR("fail_add_partition_7") ||
        write_log_rename_frm(lpt) ||
        (action_completed= TRUE, FALSE) ||
        ERROR_INJECT_CRASH("crash_add_partition_8") ||
        ERROR_INJECT_ERROR("fail_add_partition_8") ||
        (frm_install= TRUE, FALSE) ||
        mysql_write_frm(lpt, WFRM_INSTALL_SHADOW) ||
        (frm_install= FALSE, FALSE) ||
        ERROR_INJECT_CRASH("crash_add_partition_9") ||
        ERROR_INJECT_ERROR("fail_add_partition_9") ||
        (write_log_completed(lpt, FALSE), FALSE) ||
        ERROR_INJECT_CRASH("crash_add_partition_10") ||
        ERROR_INJECT_ERROR("fail_add_partition_10") ||
        (alter_partition_lock_handling(lpt), FALSE))
    {
      handle_alter_part_error(lpt, action_completed, FALSE, frm_install);
      goto err;
    }
  }
  else
  {
    /*
      ADD HASH PARTITION/
      COALESCE PARTITION/
      REBUILD PARTITION/
      REORGANIZE PARTITION

      In this case all records are still around after the change although
      possibly organised into new partitions, thus by ensuring that all
      updates go to both the old and the new partitioning scheme we can
      actually perform this operation lock-free. The only exception to
      this is when REORGANIZE PARTITION adds/drops ranges. In this case
      there needs to be an exclusive lock during the time when the range
      changes occur.
      This is only possible if the handler can ensure double-write for a
      period. The double write will ensure that it doesn't matter where the
      data is read from since both places are updated for writes. If such
      double writing is not performed then it is necessary to perform the
      change with the usual exclusive lock. With double writes it is even
      possible to perform writes in parallel with the reorganisation of
      partitions.

      Without double write procedure we get the following procedure.
      The only difference with using double write is that we can downgrade
      the lock to TL_WRITE_ALLOW_WRITE. Double write in this case only
      double writes from old to new. If we had double writing in both
      directions we could perform the change completely without exclusive
      lock for HASH partitions.
      Handlers that perform double writing during the copy phase can actually
      use a lower lock level. This can be handled inside store_lock in the
      respective handler.

      The steps below are listed in the order in which the chain executes
      them.

      0) Write an entry that removes the shadow frm file if crash occurs.
      1) Write the shadow frm file of new partitioning.
      2) Log such that temporary partitions added in change phase are
         removed in a crash situation.
      3) Add the new partitions.
         Copy from the reorganised partitions to the new partitions.
      4) Get an exclusive metadata lock on the table (waits for all active
         transactions using this table). This ensures that we
         can release all other locks on the table and since no one can open
         the table, there can be no new threads accessing the table. They
         will be hanging on this exclusive lock.
      5) Close the table.
      6) Log that operation is completed and log all complete actions
         needed to complete operation from here
         (write_log_final_change_partition; action_completed is set
         right after it).
      7) Write bin log.
      8) Install the shadow frm file (frm_install is TRUE only around
         this step).
      9) Drop the reorged partitions such that they are no longer used
         (mysql_drop_partitions). Any handler preparation needed for the
         drop and rename is presumably done inside these calls; no separate
         prepare step is visible in this function.
      10) Rename the added partitions to their real new names
          (mysql_rename_partitions).
      11) Remove entries from ddl log.
      12) Reopen the table if under lock tables.
      13) Complete query.
    */
    if (write_log_drop_shadow_frm(lpt) ||
        ERROR_INJECT_CRASH("crash_change_partition_1") ||
        ERROR_INJECT_ERROR("fail_change_partition_1") ||
        mysql_write_frm(lpt, WFRM_WRITE_SHADOW) ||
        ERROR_INJECT_CRASH("crash_change_partition_2") ||
        ERROR_INJECT_ERROR("fail_change_partition_2") ||
        write_log_add_change_partition(lpt) ||
        ERROR_INJECT_CRASH("crash_change_partition_3") ||
        ERROR_INJECT_ERROR("fail_change_partition_3") ||
        mysql_change_partitions(lpt) ||
        ERROR_INJECT_CRASH("crash_change_partition_4") ||
        ERROR_INJECT_ERROR("fail_change_partition_4") ||
        wait_while_table_is_used(thd, table, HA_EXTRA_FORCE_REOPEN) ||
        ERROR_INJECT_CRASH("crash_change_partition_5") ||
        ERROR_INJECT_ERROR("fail_change_partition_5") ||
        alter_close_table(lpt) ||
        ERROR_INJECT_CRASH("crash_change_partition_6") ||
        ERROR_INJECT_ERROR("fail_change_partition_6") ||
        write_log_final_change_partition(lpt) ||
        (action_completed= TRUE, FALSE) ||
        ERROR_INJECT_CRASH("crash_change_partition_7") ||
        ERROR_INJECT_ERROR("fail_change_partition_7") ||
        ((!thd->lex->no_write_to_binlog) &&
         (write_bin_log(thd, FALSE,
                        thd->query(), thd->query_length()), FALSE)) ||
        ERROR_INJECT_CRASH("crash_change_partition_8") ||
        ERROR_INJECT_ERROR("fail_change_partition_8") ||
        ((frm_install= TRUE), FALSE) ||
        mysql_write_frm(lpt, WFRM_INSTALL_SHADOW) ||
        (frm_install= FALSE, FALSE) ||
        ERROR_INJECT_CRASH("crash_change_partition_9") ||
        ERROR_INJECT_ERROR("fail_change_partition_9") ||
        mysql_drop_partitions(lpt) ||
        ERROR_INJECT_CRASH("crash_change_partition_10") ||
        ERROR_INJECT_ERROR("fail_change_partition_10") ||
        mysql_rename_partitions(lpt) ||
        ERROR_INJECT_CRASH("crash_change_partition_11") ||
        ERROR_INJECT_ERROR("fail_change_partition_11") ||
        (write_log_completed(lpt, FALSE), FALSE) ||
        ERROR_INJECT_CRASH("crash_change_partition_12") ||
        ERROR_INJECT_ERROR("fail_change_partition_12") ||
        (alter_partition_lock_handling(lpt), FALSE))
    {
      handle_alter_part_error(lpt, action_completed, FALSE, frm_install);
      goto err;
    }
  }
  /* Success path: the MDL downgrade is done on both exits, here and at err */
  downgrade_mdl_if_lock_tables_mode(thd, mdl_ticket, MDL_SHARED_NO_READ_WRITE);
  /*
    A final step is to write the query to the binlog and send ok to the
    user.
    NOTE(review): three of the four branches above already call
    write_bin_log(); whether fast_end_partition() writes it again is not
    visible here - confirm.
  */
  DBUG_RETURN(fast_end_partition(thd, lpt->copied, lpt->deleted, table_list));
err:
  /* Error path: any recovery was already attempted in the failing branch */
  downgrade_mdl_if_lock_tables_mode(thd, mdl_ticket, MDL_SHARED_NO_READ_WRITE);
  DBUG_RETURN(TRUE);
}