in openmp/runtime/src/kmp_dispatch.cpp [1161:1951]
int __kmp_dispatch_next_algorithm(int gtid,
dispatch_private_info_template<T> *pr,
dispatch_shared_info_template<T> volatile *sh,
kmp_int32 *p_last, T *p_lb, T *p_ub,
typename traits_t<T>::signed_t *p_st, T nproc,
T tid) {
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
typedef typename traits_t<T>::floating_t DBL;
int status = 0;
bool last = false;
T start;
ST incr;
UT limit, trip, init;
kmp_info_t *th = __kmp_threads[gtid];
kmp_team_t *team = th->th.th_team;
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
KMP_DEBUG_ASSERT(pr);
KMP_DEBUG_ASSERT(sh);
KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
buff =
__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
"sh:%%p nproc:%%%s tid:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
__kmp_str_free(&buff);
}
#endif
// zero trip count
if (pr->u.p.tc == 0) {
KD_TRACE(10,
("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
"zero status:%d\n",
gtid, status));
return 0;
}
switch (pr->schedule) {
#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
T chunk = pr->u.p.parm1;
UT nchunks = pr->u.p.parm2;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
gtid));
trip = pr->u.p.tc - 1;
if (traits_t<T>::type_size > 4) {
// use lock for 8-byte induction variable.
// TODO (optional): check presence and use 16-byte CAS
kmp_lock_t *lck = pr->u.p.steal_lock;
KMP_DEBUG_ASSERT(lck != NULL);
if (pr->u.p.count < (UT)pr->u.p.ub) {
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
__kmp_acquire_lock(lck, gtid);
// try to get own chunk of iterations
init = (pr->u.p.count)++;
status = (init < (UT)pr->u.p.ub);
__kmp_release_lock(lck, gtid);
} else {
status = 0; // no own chunks
}
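// Own chunks exhausted: become a thief. The search below probes other
// threads' buffers round-robin starting from parm4, bounded by parm3
// attempts, skipping buffers whose owners are themselves thieves.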
if (!status) { // try to steal
kmp_lock_t *lckv; // victim buffer's lock
T while_limit = pr->u.p.parm3;
T while_index = 0;
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers; // current loop index
// note: victim thread can potentially execute another loop
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
while ((!status) && (while_limit != ++while_index)) {
dispatch_private_info_template<T> *v;
T remaining;
T victimId = pr->u.p.parm4;
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
oldVictimId != victimId) {
victimId = (victimId + 1) % nproc;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
}
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
continue; // try once more (nproc attempts in total)
}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
kmp_uint32 old = UNUSED;
// try to steal whole range from inactive victim
status = v->steal_flag.compare_exchange_strong(old, THIEF);
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
T small_chunk, extras;
small_chunk = nchunks / nproc; // chunks per thread
extras = nchunks % nproc;
init = id * small_chunk + (id < extras ? id : extras);
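// Illustrative: with nchunks=10, nproc=4 the split is small_chunk=2,
// extras=2, so threads 0..3 originally own chunk ranges [0,3), [3,6),
// [6,8), [8,10). The thief adopts the inactive victim's whole range,
// executes chunk 'init' right away and publishes the rest below.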
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1; // exclude one we execute immediately
pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
__kmp_release_lock(lck, gtid);
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to reinitialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
#endif
// activate non-empty buffer and let others steal from us
if (pr->u.p.count < (UT)pr->u.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
}
}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
v->u.p.count >= (UT)v->u.p.ub) {
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
continue; // no chunks to steal, try next victim
}
lckv = v->u.p.steal_lock;
KMP_ASSERT(lckv != NULL);
__kmp_acquire_lock(lckv, gtid);
limit = v->u.p.ub; // keep initial ub
if (v->u.p.count >= limit) {
__kmp_release_lock(lckv, gtid);
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
continue; // no chunks to steal, try next victim
}
// stealing succeeded, reduce victim's ub by 1/4 of undone chunks
// TODO: is this heuristic good enough?
remaining = limit - v->u.p.count;
if (remaining > 7) {
// steal 1/4 of remaining
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
init = (v->u.p.ub -= (remaining >> 2));
} else {
// steal 1 chunk of 1..7 remaining
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
init = (v->u.p.ub -= 1);
}
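// Illustrative: if the victim has count=10 and ub=30 (20 chunks left),
// the thief trims the victim to [10,25) and takes chunks [25,30) for
// itself, executing chunk 25 immediately and keeping 26..29 in its own
// buffer for later (or for other thieves).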
__kmp_release_lock(lckv, gtid);
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(10, (buff, gtid, victimId, init, limit));
__kmp_str_free(&buff);
}
#endif
KMP_DEBUG_ASSERT(init + 1 <= limit);
pr->u.p.parm4 = victimId; // remember victim to steal from
status = 1;
// now update own count and ub with stolen range excluding init chunk
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1;
pr->u.p.ub = limit;
__kmp_release_lock(lck, gtid);
// activate non-empty buffer and let others steal from us
if (init + 1 < limit)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
} // while (search for victim)
} // if (try to find victim and steal)
} else {
// 4-byte induction variable, use 8-byte CAS for pair (count, ub)
// as all operations on pair (count, ub) must be done atomically
typedef union {
struct {
UT count;
T ub;
} p;
kmp_int64 b;
} union_i4;
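// Packing (count, ub) into a single 64-bit word lets one CAS both claim a
// chunk (count++) and read a consistent upper bound, so this path needs no
// per-buffer lock, unlike the 8-byte case above.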
union_i4 vold, vnew;
if (pr->u.p.count < (UT)pr->u.p.ub) {
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
vnew.b = vold.b;
vnew.p.count++; // get chunk from head of self range
while (!KMP_COMPARE_AND_STORE_REL64(
(volatile kmp_int64 *)&pr->u.p.count,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
KMP_CPU_PAUSE();
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
vnew.b = vold.b;
vnew.p.count++;
}
init = vold.p.count;
status = (init < (UT)vold.p.ub);
} else {
status = 0; // no own chunks
}
if (!status) { // try to steal
T while_limit = pr->u.p.parm3;
T while_index = 0;
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers; // current loop index
// note: victim thread can potentially execute another loop
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
while ((!status) && (while_limit != ++while_index)) {
dispatch_private_info_template<T> *v;
T remaining;
T victimId = pr->u.p.parm4;
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
oldVictimId != victimId) {
victimId = (victimId + 1) % nproc;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
}
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
continue; // try once more (nproc attempts in total)
}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
kmp_uint32 old = UNUSED;
// try to steal whole range from inactive victim
status = v->steal_flag.compare_exchange_strong(old, THIEF);
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
T small_chunk, extras;
small_chunk = nchunks / nproc; // chunks per thread
extras = nchunks % nproc;
init = id * small_chunk + (id < extras ? id : extras);
vnew.p.count = init + 1;
vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
// write pair (count, ub) at once atomically
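// (On 32-bit x86 a plain 64-bit store is not guaranteed to be atomic,
// hence the exchange below; other targets presumably can store the pair
// atomically with an ordinary 8-byte store.)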
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
#else
*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
#endif
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to initialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
#endif
// activate non-empty buffer and let others steal from us
if (pr->u.p.count < (UT)pr->u.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
}
}
while (1) { // CAS loop with check if victim still has enough chunks
// many threads may be stealing concurrently from same victim
vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
vold.p.count >= (UT)vold.p.ub) {
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
break; // no chunks to steal, try next victim
}
vnew.b = vold.b;
remaining = vold.p.ub - vold.p.count;
// try to steal 1/4 of remaining
// TODO: is this heuristic good enough?
if (remaining > 7) {
vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
} else {
vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
}
KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
if (KMP_COMPARE_AND_STORE_REL64(
(volatile kmp_int64 *)&v->u.p.count,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
// stealing succeeded
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
__kmp_str_free(&buff);
}
#endif
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
vold.p.ub - vnew.p.ub);
status = 1;
pr->u.p.parm4 = victimId; // keep victim id
// now update own count and ub
init = vnew.p.ub;
vold.p.count = init + 1;
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
*(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
// activate non-empty buffer and let others steal from us
if (vold.p.count < (UT)vold.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
} // if (check CAS result)
KMP_CPU_PAUSE(); // CAS failed, retry the steal attempt
} // while (try to steal from particular victim)
} // while (search for victim)
} // if (try to find victim and steal)
} // if (4-byte induction variable)
if (!status) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
start = pr->u.p.lb;
init *= chunk;
limit = chunk + init - 1;
incr = pr->u.p.st;
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
KMP_DEBUG_ASSERT(init <= trip);
// keep track of done chunks for possible early exit from stealing
// TODO: count executed chunks locally with rare update of shared location
// test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
} // if
break;
} // case
#endif // KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_balanced: {
KD_TRACE(
10,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
gtid));
/* check if thread has any iteration to do */
if ((status = !pr->u.p.count) != 0) {
pr->u.p.count = 1;
*p_lb = pr->u.p.lb;
*p_ub = pr->u.p.ub;
last = (pr->u.p.parm1 != 0);
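// parm1 presumably flags, at init time, whether this thread's single
// precomputed range contains the last iteration of the loop.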
if (p_st != NULL)
*p_st = pr->u.p.st;
} else { /* no iterations to do */
pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
}
} // case
break;
case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
merged here */
case kmp_sch_static_chunked: {
T parm1;
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
"kmp_sch_static_[affinity|chunked] case\n",
gtid));
parm1 = pr->u.p.parm1;
trip = pr->u.p.tc - 1;
init = parm1 * (pr->u.p.count + tid);
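// Chunk-cyclic distribution: thread tid takes chunk numbers tid,
// tid+nproc, tid+2*nproc, ..., each of parm1 iterations. Illustrative:
// parm1=4, nproc=3, tid=1 yields iterations 4..7, then 16..19, and so on.
// (For the merged kmp_sch_static_greedy case parm1 is presumably sized at
// init so that each thread gets at most one such block.)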
if ((status = (init <= trip)) != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
limit = parm1 + init - 1;
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
*p_st = incr;
pr->u.p.count += nproc;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
} // if
} // if
} // case
break;
case kmp_sch_dynamic_chunked: {
UT chunk_number;
UT chunk_size = pr->u.p.parm1;
UT nchunks = pr->u.p.parm2;
KD_TRACE(
100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
gtid));
chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
status = (chunk_number < nchunks);
if (!status) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
init = chunk_size * chunk_number;
trip = pr->u.p.tc - 1;
start = pr->u.p.lb;
incr = pr->u.p.st;
if ((last = (trip - init < (UT)chunk_size)))
limit = trip;
else
limit = chunk_size + init - 1;
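// Illustrative: chunk_size=4 and chunk_number=5 give init=20, limit=23;
// the last chunk is clipped to trip instead.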
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
} // if
} // if
} // case
break;
case kmp_sch_guided_iterative_chunked: {
T chunkspec = pr->u.p.parm1;
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
"iterative case\n",
gtid));
trip = pr->u.p.tc;
// Start atomic part of calculations
while (1) {
ST remaining; // signed, because can be < 0
init = sh->u.s.iteration; // shared value
remaining = trip - init;
if (remaining <= 0) { // AC: need to compare with 0 first
// nothing to do, don't try atomic op
status = 0;
break;
}
if ((T)remaining <
pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
// use dynamic-style schedule
// atomically increment iterations, get old value
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)chunkspec);
remaining = trip - init;
if (remaining <= 0) {
status = 0; // all iterations got by other threads
} else {
// got some iterations to work on
status = 1;
if ((T)remaining > chunkspec) {
limit = init + chunkspec - 1;
} else {
last = true; // the last chunk
limit = init + remaining - 1;
} // if
} // if
break;
} // if
limit = init + (UT)((double)remaining *
*(double *)&pr->u.p.parm3); // divide by K*nproc
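// Illustrative: with nproc=4 and K=2 the divisor is 8, so uncontended
// grabs shrink geometrically (remaining 800 -> chunk of 100, remaining
// 700 -> chunk of 87, ...) until the dynamic-style branch above takes
// over for small remainders.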
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)init, (ST)limit)) {
// CAS was successful, chunk obtained
status = 1;
--limit;
break;
} // if
} // while
if (status != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
if (p_st != NULL)
*p_st = incr;
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
} // if
} else {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} // if
} // case
break;
case kmp_sch_guided_simd: {
// same as the iterative schedule, but the current chunk is adjusted to be
// a multiple of the given chunk
T chunk = pr->u.p.parm1;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
gtid));
trip = pr->u.p.tc;
// Start atomic part of calculations
while (1) {
ST remaining; // signed, because can be < 0
init = sh->u.s.iteration; // shared value
remaining = trip - init;
if (remaining <= 0) { // AC: need to compare with 0 first
status = 0; // nothing to do, don't try atomic op
break;
}
KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
// compare with K*nproc*(chunk+1), K=2 by default
if ((T)remaining < pr->u.p.parm2) {
// use dynamic-style schedule
// atomically increment iterations, get old value
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)chunk);
remaining = trip - init;
if (remaining <= 0) {
status = 0; // all iterations got by other threads
} else {
// got some iterations to work on
status = 1;
if ((T)remaining > chunk) {
limit = init + chunk - 1;
} else {
last = true; // the last chunk
limit = init + remaining - 1;
} // if
} // if
break;
} // if
// divide by K*nproc
UT span;
__kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
&span);
UT rem = span % chunk;
if (rem) // adjust so that span%chunk == 0
span += chunk - rem;
limit = init + span;
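// Illustrative: remaining=1000 with divisor 8 gives span=125; with
// chunk=16 this is rounded up to 128 so chunk boundaries stay multiples
// of the simd chunk.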
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)init, (ST)limit)) {
// CAS was successful, chunk obtained
status = 1;
--limit;
break;
} // if
} // while
if (status != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
if (p_st != NULL)
*p_st = incr;
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
} // if
} else {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} // if
} // case
break;
case kmp_sch_guided_analytical_chunked: {
T chunkspec = pr->u.p.parm1;
UT chunkIdx;
#if KMP_USE_X87CONTROL
/* for storing original FPCW value for Windows* OS on
IA-32 architecture 8-byte version */
unsigned int oldFpcw;
unsigned int fpcwSet = 0;
#endif
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
"kmp_sch_guided_analytical_chunked case\n",
gtid));
trip = pr->u.p.tc;
KMP_DEBUG_ASSERT(nproc > 1);
KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
while (1) { /* this while loop is a safeguard against unexpected zero
chunk sizes */
chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
if (chunkIdx >= (UT)pr->u.p.parm2) {
--trip;
/* use dynamic-style scheduling */
init = chunkIdx * chunkspec + pr->u.p.count;
/* need to verify init > 0 in case of overflow in the above
* calculation */
if ((status = (init > 0 && init <= trip)) != 0) {
limit = init + chunkspec - 1;
if ((last = (limit >= trip)) != 0)
limit = trip;
}
break;
} else {
/* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
Windows* OS: limited precision could otherwise make init != 0 for
chunkIdx == 0.
*/
#if KMP_USE_X87CONTROL
/* If we haven't already done so, save original
FPCW and set precision to 64-bit, as Windows* OS
on IA-32 architecture defaults to 53-bit */
if (!fpcwSet) {
oldFpcw = _control87(0, 0);
_control87(_PC_64, _MCW_PC);
fpcwSet = 0x30000;
}
#endif
if (chunkIdx) {
init = __kmp_dispatch_guided_remaining<T>(
trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
KMP_DEBUG_ASSERT(init);
init = trip - init;
} else
init = 0;
limit = trip - __kmp_dispatch_guided_remaining<T>(
trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
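// init and limit bracket this chunk using the analytically computed count
// of iterations still remaining before chunks chunkIdx and chunkIdx+1,
// i.e. the chunk covers [trip - remaining(chunkIdx),
// trip - remaining(chunkIdx + 1)).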
KMP_ASSERT(init <= limit);
if (init < limit) {
KMP_DEBUG_ASSERT(limit <= trip);
--limit;
status = 1;
break;
} // if
} // if
} // while (1)
#if KMP_USE_X87CONTROL
/* restore FPCW if necessary
AC: check fpcwSet flag first because oldFpcw can be uninitialized here
*/
if (fpcwSet && (oldFpcw & fpcwSet))
_control87(oldFpcw, _MCW_PC);
#endif
if (status != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
if (p_st != NULL)
*p_st = incr;
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
} else {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
}
} // case
break;
case kmp_sch_trapezoidal: {
UT index;
T parm2 = pr->u.p.parm2;
T parm3 = pr->u.p.parm3;
T parm4 = pr->u.p.parm4;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
gtid));
index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
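// init is the arithmetic-series sum of the first 'index' chunk sizes,
// where chunks start at parm2 iterations and shrink by parm4 each time.
// Illustrative: parm2=10, parm4=2, index=3 -> init = 3*(20-4)/2 = 24
// (= 10+8+6); limit below becomes 4*(20-6)/2 - 1 = 27, so this chunk
// covers iterations 24..27.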
trip = pr->u.p.tc - 1;
if ((status = ((T)index < parm3 && init <= trip)) == 0) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
start = pr->u.p.lb;
limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
incr = pr->u.p.st;
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
} // if
} // if
} // case
break;
default: {
status = 0; // to avoid complaints about uninitialized variable use
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
KMP_HNT(GetNewerLibrary), // Hint
__kmp_msg_null // Variadic argument list terminator
);
} break;
} // switch
if (p_last)
*p_last = last;
#ifdef KMP_DEBUG
if (pr->flags.ordered) {
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
"ordered_lower:%%%s ordered_upper:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
__kmp_str_free(&buff);
}
{
char *buff;
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
KMP_DEBUG_ASSERT(p_last);
KMP_DEBUG_ASSERT(p_st);
KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
__kmp_str_free(&buff);
}
#endif
return status;
}