void Optimize_table_order::best_access_path()

in sql/sql_planner.cc [399:1083]


void Optimize_table_order::best_access_path(
                 JOIN_TAB  *s,
                 table_map remaining_tables,
                 uint      idx,
                 bool      disable_jbuf,
                 double    record_count,
                 POSITION *pos,
                 POSITION *loose_scan_pos)
{
  Key_use *best_key=        NULL;
  uint best_max_key_part=   0;
  bool found_constraint=    false;
  double best=              DBL_MAX;
  double best_time=         DBL_MAX;
  double records=           DBL_MAX;
  table_map best_ref_depends_map= 0;
  double tmp;
  bool best_uses_jbuf= false;
  Opt_trace_context * const trace= &thd->opt_trace;

  status_var_increment(thd->status_var.last_query_partial_plans);

  /*
    Cannot use join buffering if either
     1. This is the first table in the join sequence, or
     2. Join buffering is not enabled
        (Only Block Nested Loop is considered in this context)
  */
  disable_jbuf= disable_jbuf ||
    idx == join->const_tables ||                                     // 1
    !thd->optimizer_switch_flag(OPTIMIZER_SWITCH_BNL);               // 2

  Loose_scan_opt loose_scan_opt;
  DBUG_ENTER("Optimize_table_order::best_access_path");

  Opt_trace_object trace_wrapper(trace, "best_access_path");
  Opt_trace_array trace_paths(trace, "considered_access_paths");

  {
    /*
      Loose-scan specific logic:
      - we must decide whether this is within the dups_producing range.
      - if 'pos' is within the JOIN::positions array, then decide this
      by using the pos[-1] entry.
      - if 'pos' is not in the JOIN::positions array, then
      in_dups_producing_range must be false (this case may occur in
      semijoin_*_access_paths() which calls best_access_path() with 'pos'
      allocated on the stack).
      @todo One day Loose-scan will be considered in advance_sj_state() only,
      outside best_access_path(), so this complicated logic will not be
      needed.
    */
    const bool in_dups_producing_range=
      (idx == join->const_tables) ?
      false :
      (pos == (join->positions + idx) ?
       (pos[-1].dups_producing_tables != 0) :
       false);
    loose_scan_opt.init(s, remaining_tables, in_dups_producing_range,
                        emb_sjm_nest != NULL);
  }

  /*
    This isn't unlikely at all, but unlikely() cuts 6% CPU time on a 20-table
    search when s->keyuse==0, and has no cost when s->keyuse!=0.
  */
  if (unlikely(s->keyuse != NULL))
  {                                            /* Use key if possible */
    TABLE *const table= s->table;
    double best_records= DBL_MAX;

    /* Test how we can use keys */
    ha_rows rec=
      s->records/MATCHING_ROWS_IN_OTHER_TABLE;  // Assumed records/key
    for (Key_use *keyuse=s->keyuse; keyuse->table == table; )
    {
      key_part_map found_part= 0;
      table_map found_ref= 0;
      const uint key= keyuse->key;
      uint max_key_part= 0;
      KEY *const keyinfo= table->key_info+key;
      const bool ft_key= (keyuse->keypart == FT_KEYPART);
      /* Bitmap of keyparts where the ref access is over 'keypart=const': */
      key_part_map const_part= 0;
      /* The or-null keypart in ref-or-null access: */
      key_part_map ref_or_null_part= 0;
      /// Set dodgy_ref_cost only if that index is chosen for ref access.
      bool is_dodgy= false;

      /* Calculate how many key segments of the current key we can use */
      Key_use *const start_key= keyuse;

      loose_scan_opt.next_ref_key();
      DBUG_PRINT("info", ("Considering ref access on key %s",
                          keyuse->table->key_info[keyuse->key].name));
      Opt_trace_object trace_access_idx(trace);
      trace_access_idx.add_alnum("access_type", "ref").
        add_utf8("index", keyinfo->name);

      // For each keypart
      while (keyuse->table == table && keyuse->key == key)
      {
        const uint keypart= keyuse->keypart;
        table_map best_part_found_ref= 0;
        double best_prev_record_reads= DBL_MAX;

        // For each way to access the keypart
        for ( ; keyuse->table == table && keyuse->key == key &&
                keyuse->keypart == keypart ; ++keyuse)
        {
          /*
            When calculating a plan for a materialized semijoin nest,
            we must not consider key references between tables inside the
            semijoin nest and those outside of it. The same applies to a
            materialized subquery.
          */
          if ((excluded_tables & keyuse->used_tables))
            continue;
          /*
            Use this keyuse only if
            1. the expression doesn't refer to forward tables, and
            2. we won't get two ref-or-null keyparts
          */
          if (!(remaining_tables & keyuse->used_tables) &&
              !(ref_or_null_part && (keyuse->optimize &
                                     KEY_OPTIMIZE_REF_OR_NULL)))
          {
            found_part|= keyuse->keypart_map;
            if (!(keyuse->used_tables & ~join->const_table_map))
              const_part|= keyuse->keypart_map;

            double tmp2= prev_record_reads(join, idx, (found_ref |
                                                      keyuse->used_tables));
            if (tmp2 < best_prev_record_reads)
            {
              best_part_found_ref= keyuse->used_tables & ~join->const_table_map;
              best_prev_record_reads= tmp2;
            }
            if (rec > keyuse->ref_table_rows)
              rec= keyuse->ref_table_rows;
            /*
              If there is one 'key_column IS NULL' expression, we can
              use the ref_or_null optimisation on this field.
            */
            if (keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL)
              ref_or_null_part |= keyuse->keypart_map;
          }
          loose_scan_opt.add_keyuse(remaining_tables, keyuse);
        }
        found_ref|= best_part_found_ref;
      }

      /*
        Assume that each key matches a proportional part of the table.
      */
      if (!found_part && !ft_key && !loose_scan_opt.have_a_case())
      {
        trace_access_idx.add("usable", false);
        goto done_with_index;                  // Nothing usable found
      }

      if (rec < MATCHING_ROWS_IN_OTHER_TABLE)
        rec= MATCHING_ROWS_IN_OTHER_TABLE;      // Fix for small tables

      /*
        ft-keys require special treatment
      */
      if (ft_key)
      {
        /*
          Really, records should be 0.0 here (yes!),
          but 1.0 is probably safer
        */
        tmp= prev_record_reads(join, idx, found_ref);
        records= 1.0;
      }
      else
      {
        found_constraint= MY_TEST(found_part);
        loose_scan_opt.check_ref_access_part1(s, key, start_key, found_part);

        /* Check if we found full key */
        if (found_part == LOWER_BITS(key_part_map, actual_key_parts(keyinfo)) &&
            !ref_or_null_part)
        {                                         /* use eq key */
          max_key_part= (uint) ~0;
          if ((keyinfo->flags & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME)
          {
            tmp = prev_record_reads(join, idx, found_ref);
            records=1.0;
          }
          else
          {
            if (!found_ref)
            {                                     /* We found a const key */
              /*
                ReuseRangeEstimateForRef-1:
                We get here if we've found a ref(const) (c_i are constants):
                  "(keypart1=c1) AND ... AND (keypartN=cN)"   [ref_const_cond]
                
                If range optimizer was able to construct a "range"
                access on this index, then its condition "quick_cond" was
                equivalent to ref_const_cond (*), and we can re-use E(#rows)
                from the range optimizer.

                Proof of (*): By properties of the range and ref optimizers,
                quick_cond will be equal to or tighter than ref_const_cond.
                ref_const_cond already covers the smallest possible interval:
                a single-point interval over all keyparts. Therefore,
                quick_cond is equivalent to ref_const_cond (if it was an 
                empty interval we wouldn't have got here).
              */
              if (table->quick_keys.is_set(key))
                records= (double) table->quick_rows[key];
              else
              {
                /* quick_range couldn't use key! */
                records= (double) s->records/rec;
              }
            }
            else
            {
              if (!(records= keyinfo->rec_per_key[actual_key_parts(keyinfo)-1]))
              {                                   /* Prefer longer keys */
                records=
                  ((double) s->records / (double) rec *
                   (1.0 +
                    ((double) (table->s->max_key_length-keyinfo->key_length) /
                     (double) table->s->max_key_length)));
                if (records < 2.0)
                  records=2.0;               /* Can't be as good as a unique */
              }
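              /*
                Illustrative example with assumed figures for the
                "prefer longer keys" estimate above (used when rec_per_key
                statistics are missing): s->records = 10000, rec = 10,
                table->s->max_key_length = 64 and keyinfo->key_length = 16
                give records = (10000 / 10) * (1.0 + 48.0 / 64.0) = 1750,
                so a shorter key gets a proportionally larger (worse) row
                estimate than a longer key on the same table.
              */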
              /*
                ReuseRangeEstimateForRef-2:  We get here if we could not reuse
                E(#rows) from range optimizer. Make another try:
                
                If range optimizer produced E(#rows) for a prefix of the ref
                access we're considering, and that E(#rows) is lower than our
                current estimate, make an adjustment. The criterion for when
                we can make an adjustment is a special case of the criterion
                used in ReuseRangeEstimateForRef-3.
              */
              if (table->quick_keys.is_set(key) &&
                  (const_part &
                    (((key_part_map)1 << table->quick_key_parts[key])-1)) ==
                  (((key_part_map)1 << table->quick_key_parts[key])-1) &&
                  table->quick_n_ranges[key] == 1 &&
                  records > (double) table->quick_rows[key])
              {
                records= (double) table->quick_rows[key];
              }
            }
            /* Limit the number of matched rows */
            tmp= records;
            set_if_smaller(tmp, (double) thd->variables.max_seeks_for_key);
            if (table->covering_keys.is_set(key))
            {
              /* we can use only index tree */
              tmp= record_count * table->file->index_only_read_time(key, tmp);
            }
            else
              tmp= record_count*min(tmp,s->worst_seeks);
          }
        }
        else
        {
          /*
            Use as many keyparts as possible, and a unique key is better
            than a non-unique key
            Set tmp to (previous record count) * (records / combination)
          */
          if ((found_part & 1) &&
              (!(table->file->index_flags(key, 0, 0) & HA_ONLY_WHOLE_INDEX) ||
               found_part == LOWER_BITS(key_part_map,
                                        actual_key_parts(keyinfo))))
          {
            max_key_part= max_part_bit(found_part);
            /*
              ReuseRangeEstimateForRef-3:
              We're now considering a ref[or_null] access via
              (t.keypart1=e1 AND ... AND t.keypartK=eK) [ OR  
              (same-as-above but with one cond replaced 
               with "t.keypart_i IS NULL")]  (**)
              
              Try re-using E(#rows) from "range" optimizer:
              We can do so if "range" optimizer used the same intervals as
              in (**). The intervals used by the range optimizer may not be
              available at this point (as "range" access might have chosen to
              create a quick select over another index), so we can't compare
              them to (**). We'll make indirect judgements instead.
              The sufficient conditions for re-use are:
              (C1) All e_i in (**) are constants, i.e. found_ref==FALSE. (if
                   this is not satisfied we have no way to know which ranges
                   will be actually scanned by 'ref' until we execute the 
                   join)
              (C2) max #key parts in 'range' access == K == max_key_part (this
                   is apparently a necessary requirement)

              We also have a property that "range optimizer produces equal or
              tighter set of scan intervals than ref(const) optimizer". Each
              of the intervals in (**) is a "tightest possible" interval when
              one limits oneself to using keyparts 1..K (which we do in (C2)).
              From here it follows that range access used either one, or
              both of the (I1) and (I2) intervals:

               (t.keypart1=e1 AND ... AND t.keypartK=eK)  (I1)
               (same-as-above but with one cond replaced
                with "t.keypart_i IS NULL")               (I2)

              The remaining part is to exclude the situation where the range
              optimizer used one interval while we're considering
              ref-or-null and looking for an estimate for two intervals. This
              is done by the last condition:

              (C3) "range optimizer used (have ref_or_null?2:1) intervals"
            */
            if (table->quick_keys.is_set(key) && !found_ref &&          //(C1)
                table->quick_key_parts[key] == max_key_part &&          //(C2)
                table->quick_n_ranges[key] == 1+MY_TEST(ref_or_null_part)) //(C3)
            {
              tmp= records= (double) table->quick_rows[key];
            }
            else
            {
              /* Check if we have statistics about the distribution */
              if ((records= keyinfo->rec_per_key[max_key_part-1]))
              {
                /* 
                  Fix for the case where the index statistics are too
                  optimistic: If
                  (1) We're considering ref(const) and there is a quick select
                      on the same index,
                  (2) and that quick select uses more keyparts (i.e. it will
                      scan an equal or smaller interval than this ref(const)),
                  (3) and E(#rows) for the quick select is higher than our
                      estimate,
                  Then
                    We'll use E(#rows) from the quick select.

                  One observation is that when there are multiple
                  indexes with a common prefix (eg (b) and (b, c)) we
                  are not always selecting (b, c) even when this can
                  use more keyparts. Inaccuracies in statistics from
                  the storage engines can cause the record estimate
                  for the quick object for (b) to be lower than the
                  record estimate for the quick object for (b,c).

                  Q: Why do we choose to use 'ref'? Won't quick select be
                  cheaper in some cases?
                  TODO: figure this out and adjust the plan choice if needed.
                */
                if (!found_ref && table->quick_keys.is_set(key) &&    // (1)
                    table->quick_key_parts[key] > max_key_part &&     // (2)
                    records < (double)table->quick_rows[key])         // (3)
                {
                  records= (double)table->quick_rows[key];
                  is_dodgy= true;
                }

                tmp= records;
              }
              else
              {
                /*
                  Assume that the first key part matches 1% of the file
                  and that the whole key matches 10 (duplicates) or 1
                  (unique) records.
                  Assume also that the number of matched records decreases
                  proportionally as more key parts are used (a linear
                  interpolation between a and b).
                  This gives the formula:
                  records = (x * (b-a) + a*c-b)/(c-1)

                  b = records matched by whole key
                  a = records matched by first key part (1% of all records?)
                  c = number of key parts in key
                  x = used key parts (1 <= x <= c)
                */
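                /*
                  Illustrative worked example of the formula above, with
                  assumed figures: s->records = 10000, so a = 100;
                  b = rec_per_key for the whole key = 10; c = 3 key parts;
                  x = 2 used key parts. Then
                  records = (2*(10 - 100) + 100*3 - 10) / (3 - 1) = 55,
                  i.e. the estimate interpolates linearly from a (at x = 1)
                  down to b (at x = c).
                */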
                double rec_per_key;
                if (!(rec_per_key=(double)
                      keyinfo->rec_per_key[keyinfo->user_defined_key_parts-1]))
                  rec_per_key=(double) s->records/rec+1;

                if (!s->records)
                  tmp = 0;
                else if (rec_per_key/(double) s->records >= 0.01)
                  tmp = rec_per_key;
                else
                {
                  double a=s->records*0.01;
                  if (keyinfo->user_defined_key_parts > 1)
                    tmp= (max_key_part * (rec_per_key - a) +
                          a * keyinfo->user_defined_key_parts - rec_per_key) /
                         (keyinfo->user_defined_key_parts - 1);
                  else
                    tmp= a;
                  set_if_bigger(tmp,1.0);
                }
                records = (ulong) tmp;
              }

              if (ref_or_null_part)
              {
                /* We need to do two index lookups to find the rows */
                tmp *= 2.0;
                records *= 2.0;
              }

              /*
                ReuseRangeEstimateForRef-4:  We get here if we could not reuse
                E(#rows) from range optimizer. Make another try:
                
                If range optimizer produced E(#rows) for a prefix of the ref 
                access we're considering, and that E(#rows) is lower than our
                current estimate, make the adjustment.

                The decision whether we can re-use the estimate from the range
                optimizer is the same as in ReuseRangeEstimateForRef-3,
                applied to first table->quick_key_parts[key] key parts.
              */
              if (table->quick_keys.is_set(key) &&
                  table->quick_key_parts[key] <= max_key_part &&
                  const_part &
                    ((key_part_map)1 << table->quick_key_parts[key]) &&
                  table->quick_n_ranges[key] == 1 + MY_TEST(ref_or_null_part &
                                                            const_part) &&
                  records > (double) table->quick_rows[key])
              {
                tmp= records= (double) table->quick_rows[key];
              }
            }

            /* Limit the number of matched rows */
            set_if_smaller(tmp, (double) thd->variables.max_seeks_for_key);
            if (table->covering_keys.is_set(key))
            {
              /* we can use only index tree */
              tmp= record_count * table->file->index_only_read_time(key, tmp);
            }
            else
              tmp= record_count * min(tmp,s->worst_seeks);
          }
          else
            tmp= best_time;                    // Do nothing
        }
        // {semijoin LooseScan + ref} is disabled
#if 0
        loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
#endif

      } /* not ft_key */

      {
        const double idx_time= tmp + records * ROW_EVALUATE_COST;
        trace_access_idx.add("rows", records).add("cost", idx_time);
        if (idx_time < best_time)
        {
          best_time= idx_time;
          best= tmp;
          best_records= records;
          best_key= start_key;
          best_max_key_part= max_key_part;
          best_ref_depends_map= found_ref;
        }
      }
  done_with_index:
      bool chosen= (best_key == start_key);
      trace_access_idx.add("chosen", chosen);
      if (chosen)
        s->dodgy_ref_cost= is_dodgy;
    } /* for each key */
    records= best_records;
  }

  Opt_trace_object trace_access_scan(trace);
  /*
    Don't test table scan if it can't be better.
    Prefer key lookup if we would use the same key for scanning.

    Don't do a table scan on InnoDB tables if we can read the used
    parts of the row from any of the used indexes.
    This is because InnoDB table scans use an index, so we would not win
    anything by using a table scan. The only exception is the INDEX_MERGE
    quick select. We cannot say for sure that an INDEX_MERGE quick select
    is always faster than ref access, so it's necessary to check whether
    ref access is more expensive.

    A word-for-word translation of the if-statements below, in sergefp's
    understanding: we check whether we should use a table scan if:
    (1) The found 'ref' access produces more records than a table scan
        (or index scan, or quick select), or 'ref' is more expensive than
        any of them.
    (2) The best way to perform a table or index scan is to use 'range' access
        on index IDX. If it is a 'tight range' scan (i.e. not a loose index
        scan or 'index merge'), then ref access on the same index will
        perform equally well or better if ref access can use the same or a
        greater number of key parts.
    (3) See above note about InnoDB.
    (4) NOT ("FORCE INDEX(...)" is used for table and there is 'ref' access
             path, but there is no quick select)
        If the condition in the above brackets holds, then the only possible
        "table scan" access method is ALL/index (there is no quick select).
        Since we have a 'ref' access path, and FORCE INDEX instructs us to
        choose it over ALL/index, there is no need to consider a full table
        scan.
  */
  if (!(records >= s->found_records || best > s->read_time))            // (1)
  {
    // "scan" means (full) index scan or (full) table scan.
    trace_access_scan.add_alnum("access_type", s->quick ? "range" : "scan").
      add("cost", s->read_time + s->found_records * ROW_EVALUATE_COST).
      add("rows", s->found_records).
      add_alnum("cause", "cost");

    goto skip_table_scan;
  }

  if ((s->quick && best_key && s->quick->index == best_key->key &&      // (2)
       best_max_key_part >= s->table->quick_key_parts[best_key->key]) &&  // (2)
      (s->quick->get_type() !=
       QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX) &&                      // (2)
      (s->quick->get_type() !=
       QUICK_SELECT_I::QS_TYPE_SKIP_SCAN))                      // (2)
  {
    trace_access_scan.add_alnum("access_type", "range").
      add_alnum("cause", "heuristic_index_cheaper");
    goto skip_table_scan;
  }

  if ((s->table->file->ha_table_flags() & HA_TABLE_SCAN_ON_INDEX) &&    //(3)
      best_key &&                                                       //(3)
      (!s->table->covering_keys.is_clear_all() ||
       (s->table->file->primary_key_is_clustered() &&
        s->table->s->primary_key == best_key->key)) &&                  //(3)
      (!s->quick ||                                                     //(3)
       (s->quick->get_type() == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT &&//(3)
        best < s->quick->read_time)))                                   //(3)
  {
    trace_access_scan.add_alnum("access_type", s->quick ? "range" : "scan").
      add_alnum("cause", "covering_index_better_than_full_scan");
    goto skip_table_scan;
  }

  if ((s->table->force_index && best_key && !s->quick))                 // (4)
  {
    trace_access_scan.add_alnum("access_type", "scan").
      add_alnum("cause", "force_index");
    goto skip_table_scan;
  }

  {                                             // Check full join
    ha_rows rnd_records= s->found_records;
    /*
      If there is a filtering condition on the table (i.e. ref analyzer found
      at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
      preceding this table in the join order we're now considering), then 
      assume that 25% of the rows will be filtered out by this condition.

      This heuristic is supposed to force tables used in exprZ to be before
      this table in join order.
    */
    if (found_constraint)
      rnd_records-= rnd_records/4;

    /*
      If applicable, get a more accurate estimate. Don't use the two
      heuristics at once.
    */
    if (s->table->quick_condition_rows != s->found_records)
      rnd_records= s->table->quick_condition_rows;

    /*
      Range optimizer never proposes a RANGE if it isn't better
      than FULL: so if RANGE is present, it's always preferred to FULL.
      Here we estimate its cost.
    */

    if (s->quick)
    {
      trace_access_scan.add_alnum("access_type", "range");
      /*
        For each record we:
        - read the record range through 'quick'
        - skip rows which do not satisfy the WHERE constraints
        TODO: 
        We take into account possible use of join cache for ALL/index
        access (see first else-branch below), but we don't take it into 
        account here for range/index_merge access. Find out why this is so.
      */
      tmp= record_count *
        (s->quick->read_time +
         (s->found_records - rnd_records) * ROW_EVALUATE_COST);

      loose_scan_opt.check_range_access(join, idx, s->quick);
    }
    else
    {
      trace_access_scan.add_alnum("access_type", "scan");
      /* Estimate cost of reading table. */
      if (s->table->force_index && !best_key) // index scan
        tmp= s->table->file->read_time(s->ref.key, 1, s->records);
      else // table scan
        tmp= s->table->file->scan_time();

      if (disable_jbuf)
      {
        /*
          For each record we have to:
          - read the whole table record
          - skip rows which do not satisfy the join condition
        */
        tmp= record_count *
             (tmp + (s->records - rnd_records) * ROW_EVALUATE_COST);
      }
      else
      {
        trace_access_scan.add("using_join_cache", true);
        /*
          We read the table as many times as the join buffer becomes full.
          It would be more exact to round the result of the division with
          floor(), but that takes 5% of time in a 20-table query plan search.
        */
        tmp*= (1.0 + ((double) cache_record_length(join,idx) *
                      record_count /
                      (double) thd->variables.join_buff_size));
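        /*
          Illustrative example of the multiplier above, with assumed
          figures: record_count = 1000 prefix rows,
          cache_record_length(join,idx) = 100 bytes and
          join_buff_size = 262144 bytes give
          1.0 + 100*1000/262144 ~= 1.38, i.e. the table is scanned
          roughly 1.4 times because the join buffer fills up part-way
          through the prefix rows.
        */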
        /* 
           We don't make a full Cartesian product between rows in the scanned
           table and existing records, because we skip all rows from the
           scanned table that do not satisfy the join condition when
           we read the table (see flush_cached_records for details). Here we
           take into account the cost to read and skip these records.
        */
        tmp+= (s->records - rnd_records) * ROW_EVALUATE_COST;
      }
    }

    const double scan_cost=
      tmp + (record_count * ROW_EVALUATE_COST * rnd_records);

    trace_access_scan.add("rows", rows2double(rnd_records)).
      add("cost", scan_cost);
    /*
      We estimate the cost of evaluating the WHERE clause for the found
      records as record_count * rnd_records * ROW_EVALUATE_COST. This cost
      plus tmp gives us the total cost of using TABLE SCAN.
    */
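    /*
      Illustrative example, assuming for illustration only that
      ROW_EVALUATE_COST is 0.20: with record_count = 10, tmp = 500,
      rnd_records = 1000, best = 300 and records = 50, the scan side is
      500 + 10*0.20*1000 = 2500 and the 'ref' side is
      300 + 10*0.20*50 = 400, so the table scan is rejected.
    */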
    if (best == DBL_MAX ||
        (scan_cost < best + (record_count * ROW_EVALUATE_COST * records)))
    {
      /*
        If the table has a range access (s->quick is set), make_join_select()
        will ensure that it is used
      */
      best= tmp;
      records= rows2double(rnd_records);
      best_key= 0;
      /* range/index_merge/ALL/index access method are "independent", so: */
      best_ref_depends_map= 0;
      best_uses_jbuf= MY_TEST(!disable_jbuf);
    }
  }

skip_table_scan:
  trace_access_scan.add("chosen", best_key == NULL);

  /* Update the cost information for the current partial plan */
  pos->records_read= records;
  pos->read_time=    best;
  pos->key=          best_key;
  pos->table=        s;
  pos->ref_depend_map= best_ref_depends_map;
  pos->loosescan_key= MAX_KEY;
  pos->use_join_buffer= best_uses_jbuf;

  loose_scan_opt.save_to_position(s, loose_scan_pos);

  if (!best_key &&
      idx == join->const_tables &&
      s->table == join->sort_by_table &&
      join->unit->select_limit_cnt >= records)
  {
    trace_access_scan.add("use_tmp_table", true);
    join->sort_by_table= (TABLE*) 1;  // Must use temporary table
  }

  DBUG_VOID_RETURN;
}