int __kmp_dispatch_next_algorithm()

in openmp/runtime/src/kmp_dispatch.cpp [1161:1951]


template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  bool last = false;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;
    UT nchunks = pr->u.p.parm2;
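    // Bookkeeping used by this case: parm1 = chunk size, parm2 = total
    // number of chunks, parm3 = bound on the victim-search loop below,
    // parm4 = tid of the next victim to probe. [count, ub) is this
    // thread's remaining range of chunk indices: the owner takes chunks
    // from the head, thieves shrink the range from the tail.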
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variable.
      // TODO (optional): check presence and use 16-byte CAS
      kmp_lock_t *lck = pr->u.p.steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_lock_t *lckv; // victim buffer's lock
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *v;
          T remaining;
          T victimId = pr->u.p.parm4;
          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
          v = reinterpret_cast<dispatch_private_info_template<T> *>(
              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(v);
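          // Scan victims circularly starting at parm4, skipping our own
          // buffer and buffers already marked THIEF (drained); oldVictimId
          // is the id just before the starting point, so the scan stops
          // after at most one full wrap.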
          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
                 oldVictimId != victimId) {
            victimId = (victimId + 1) % nproc;
            v = reinterpret_cast<dispatch_private_info_template<T> *>(
                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(v);
          }
          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
            continue; // try once more (nproc attempts in total)
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
            kmp_uint32 old = UNUSED;
            // try to steal whole range from inactive victim
            status = v->steal_flag.compare_exchange_strong(old, THIEF);
            if (status) {
              // initialize self buffer with victim's whole range of chunks
              T id = victimId;
              T small_chunk, extras;
              small_chunk = nchunks / nproc; // chunks per thread
              extras = nchunks % nproc;
              init = id * small_chunk + (id < extras ? id : extras);
              __kmp_acquire_lock(lck, gtid);
              pr->u.p.count = init + 1; // exclude one we execute immediately
              pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
              __kmp_release_lock(lck, gtid);
              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
              // no need to reinitialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                    "count:%%%s ub:%%%s\n",
                    traits_t<UT>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              // activate non-empty buffer and let others steal from us
              if (pr->u.p.count < (UT)pr->u.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            }
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
              v->u.p.count >= (UT)v->u.p.ub) {
            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
            continue; // no chunks to steal, try next victim
          }
          lckv = v->u.p.steal_lock;
          KMP_ASSERT(lckv != NULL);
          __kmp_acquire_lock(lckv, gtid);
          limit = v->u.p.ub; // keep initial ub
          if (v->u.p.count >= limit) {
            __kmp_release_lock(lckv, gtid);
            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
            continue; // no chunks to steal, try next victim
          }

          // stealing succeeded, reduce victim's ub by 1/4 of remaining chunks
          // TODO: is this heuristic good enough?
          remaining = limit - v->u.p.count;
          if (remaining > 7) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (v->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 1..7 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (v->u.p.ub -= 1);
          }
          __kmp_release_lock(lckv, gtid);
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                "count:%%%s ub:%%%s\n",
                traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(10, (buff, gtid, victimId, init, limit));
            __kmp_str_free(&buff);
          }
#endif
          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimId; // remember victim to steal from
          status = 1;
          // now update own count and ub with stolen range excluding init chunk
          __kmp_acquire_lock(lck, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(lck, gtid);
          // activate non-empty buffer and let others steal from us
          if (init + 1 < limit)
            KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      // as all operations on pair (count, ub) must be done atomically
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      union_i4 vold, vnew;
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew.b = vold.b;
        vnew.p.count++; // get chunk from head of self range
        while (!KMP_COMPARE_AND_STORE_REL64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew.b = vold.b;
          vnew.p.count++;
        }
        init = vold.p.count;
        status = (init < (UT)vold.p.ub);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *v;
          T remaining;
          T victimId = pr->u.p.parm4;
          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
          v = reinterpret_cast<dispatch_private_info_template<T> *>(
              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(v);
          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
                 oldVictimId != victimId) {
            victimId = (victimId + 1) % nproc;
            v = reinterpret_cast<dispatch_private_info_template<T> *>(
                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(v);
          }
          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
            continue; // try once more (nproc attempts in total)
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
            kmp_uint32 old = UNUSED;
            // try to steal whole range from inactive victim
            status = v->steal_flag.compare_exchange_strong(old, THIEF);
            if (status) {
              // initialize self buffer with victim's whole range of chunks
              T id = victimId;
              T small_chunk, extras;
              small_chunk = nchunks / nproc; // chunks per thread
              extras = nchunks % nproc;
              init = id * small_chunk + (id < extras ? id : extras);
              vnew.p.count = init + 1;
              vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
              // write pair (count, ub) at once atomically
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
#endif
              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
              // no need to initialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                    "count:%%%s ub:%%%s\n",
                    traits_t<UT>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              // activate non-empty buffer and let others steal from us
              if (pr->u.p.count < (UT)pr->u.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            }
          }
          while (1) { // CAS loop with check if victim still has enough chunks
            // many threads may be stealing concurrently from same victim
            vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
            if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
                vold.p.count >= (UT)vold.p.ub) {
              pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
              break; // no chunks to steal, try next victim
            }
            vnew.b = vold.b;
            remaining = vold.p.ub - vold.p.count;
            // try to steal 1/4 of remaining
            // TODO: is this heuristic good enough?
            if (remaining > 7) {
              vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
            }
            KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_REL64(
                    (volatile kmp_int64 *)&v->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                    "count:%%%s ub:%%%s\n",
                    traits_t<T>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              pr->u.p.parm4 = victimId; // keep victim id
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              // activate non-empty buffer and let others steal from us
              if (vold.p.count < (UT)vold.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, retry
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
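      // init is the claimed chunk index; map it to iteration space: the
      // chunk covers [init * chunk, init * chunk + chunk - 1], clamped to
      // the trip count and then scaled by the stride.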
      start = pr->u.p.lb;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      // keep track of done chunks for possible early exit from stealing
      // TODO: count executed chunks locally with rare update of shared location
      // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }
    } // if
    break;
  } // case
#endif // KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
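    /* Each thread got a single precomputed [lb, ub] block at init time;
       count serves as a one-shot flag (0 = block not yet handed out) and
       parm1 records whether this thread owns the last iteration. */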
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = (pr->u.p.parm1 != 0);
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;
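    // parm1 is the chunk size; chunks are dealt round-robin: count advances
    // by nproc per call, so the chunk index (count + tid) walks through
    // tid, tid + nproc, tid + 2 * nproc, ...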

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    UT chunk_number;
    UT chunk_size = pr->u.p.parm1;
    UT nchunks = pr->u.p.parm2;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));
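    // Claim the next chunk index by atomically incrementing the shared
    // iteration counter; once the counter reaches nchunks, all chunks have
    // been handed out and this thread is done.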

    chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    status = (chunk_number < nchunks);
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      init = chunk_size * chunk_number;
      trip = pr->u.p.tc - 1;
      start = pr->u.p.lb;
      incr = pr->u.p.st;

      if ((last = (trip - init < (UT)chunk_size)))
        limit = trip;
      else
        limit = chunk_size + init - 1;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
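    // Each pass reads the shared iteration counter, sizes a chunk as a
    // fraction of the remaining iterations (the double stored in parm3),
    // and claims [init, limit) by CAS-ing the counter from init to limit.
    // Once fewer than parm2 iterations remain, it falls back to plain
    // dynamic chunks of size chunkspec.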
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init + (UT)((double)remaining *
                          *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
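      // The assert below relies on the shared counter advancing only in
      // chunk-sized units in this schedule: the dynamic fallback adds
      // chunk, and the CAS path rounds the claimed span up to a multiple
      // of chunk.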
      KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span;
      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
                         &span);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
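    // Two regimes: chunk indices below parm2 get the analytically derived,
    // exponentially shrinking chunks via __kmp_dispatch_guided_remaining()
    // with the factor stored in parm3; from parm2 onward the schedule
    // degenerates to dynamic-style fixed chunks of size chunkspec.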

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
/* use exponential-style scheduling */
/* The following check is to workaround the lack of long double precision on
   Windows* OS.
   This check works around the possible effect that init != 0 for chunkIdx == 0.
 */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
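    // Chunk sizes shrink linearly: chunk k has size parm2 - k * parm4 and
    // there are parm3 chunks in total, so chunk 'index' starts at the
    // arithmetic-series prefix sum index * (2 * parm2 - (index - 1) * parm4) / 2
    // computed below.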

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KMP_DEBUG_ASSERT(p_last);
    KMP_DEBUG_ASSERT(p_st);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}