in openmp/runtime/src/kmp_runtime.cpp [1355:2307]
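// Overview (summary of the body below): __kmp_fork_call does most of the work
// of forking a parallel region on behalf of the primary thread. It handles the
// special case of a parallel region nested directly inside a teams construct,
// decides how many threads the region gets, serializes the region when only
// one thread is available, otherwise allocates (or reuses) a team and releases
// the workers at the fork barrier, and for the Intel entry point invokes the
// microtask for the primary thread itself; the GNU entry points return early
// so that the caller can invoke the outlined body.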
int __kmp_fork_call(ident_t *loc, int gtid,
enum fork_context_e call_context, // Intel, GNU, ...
kmp_int32 argc, microtask_t microtask, launch_t invoker,
kmp_va_list ap) {
void **argv;
int i;
int master_tid;
int master_this_cons;
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int nthreads;
int master_active;
int master_set_numthreads;
int level;
int active_level;
int teams_level;
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t **p_hot_teams;
#endif
{ // KMP_TIME_BLOCK
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
/* Some systems prefer the stack for the root thread(s) to start with */
/* some gap from the parent stack to prevent false sharing. */
void *dummy = KMP_ALLOCA(__kmp_stkpadding);
/* These 2 lines below are so this does not get optimized out */
if (__kmp_stkpadding > KMP_MAX_STKPADDING)
__kmp_stkpadding += (short)((kmp_int64)dummy);
}
/* initialize if needed */
KMP_DEBUG_ASSERT(
__kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
__kmp_resume_if_soft_paused();
/* setup current data */
master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
// shutdown
parent_team = master_th->th.th_team;
master_tid = master_th->th.th_info.ds.ds_tid;
master_this_cons = master_th->th.th_local.this_construct;
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
ompt_data_t *parent_task_data;
ompt_frame_t *ompt_frame;
ompt_data_t *implicit_task_data;
void *return_address = NULL;
if (ompt_enabled.enabled) {
__ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
NULL, NULL);
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
}
#endif
// Assign affinity to root thread if it hasn't happened yet
__kmp_assign_root_init_mask();
// Nested level will be an index in the nested nthreads array
level = parent_team->t.t_level;
// used to launch non-serial teams even if nesting is not allowed
active_level = parent_team->t.t_active_level;
// needed to check nesting inside the teams
teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
p_hot_teams = &master_th->th.th_hot_teams;
if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
*p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
(*p_hot_teams)[0].hot_team = root->r.r_hot_team;
// it is either the actual hot team or not needed (when active_level > 0)
(*p_hot_teams)[0].hot_team_nth = 1;
}
#endif
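// Note: th_hot_teams is allocated lazily, one slot per permitted nesting level
// (__kmp_hot_teams_max_level), so nested teams of threads can be kept "hot"
// and reused by subsequent parallel regions at the same level.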
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (ompt_enabled.ompt_callback_parallel_begin) {
int team_size = master_set_numthreads
? master_set_numthreads
: get__nproc_2(parent_team, master_tid);
int flags = OMPT_INVOKER(call_context) |
((microtask == (microtask_t)__kmp_teams_master)
? ompt_parallel_league
: ompt_parallel_team);
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
master_th->th.th_ident = loc;
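// A parallel region strictly nested inside a teams construct at the teams
// level (and which is not the initial teams fork itself) reuses the existing
// parent team instead of allocating a new one; the branch below handles that
// case completely and returns without reaching the generic fork path.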
if (master_th->th.th_teams_microtask && ap &&
microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
// AC: This is the start of a parallel region nested inside a teams construct.
// The team is actual (hot); all workers are ready at the fork barrier.
// No lock is needed to initialize the team a bit, then release the workers.
parent_team->t.t_ident = loc;
__kmp_alloc_argv_entries(argc, parent_team, TRUE);
parent_team->t.t_argc = argc;
argv = (void **)parent_team->t.t_argv;
for (i = argc - 1; i >= 0; --i)
*argv++ = va_arg(kmp_va_deref(ap), void *);
// Increment our nested depth levels, but do not increase the serialization count
if (parent_team == master_th->th.th_serial_team) {
// AC: we are in serialized parallel
__kmpc_serialized_parallel(loc, gtid);
KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
if (call_context == fork_context_gnu) {
// AC: need to decrement t_serialized for enquiry functions to work
// correctly, will restore at join time
parent_team->t.t_serialized--;
return TRUE;
}
#if OMPD_SUPPORT
parent_team->t.t_pkfn = microtask;
#endif
#if OMPT_SUPPORT
void *dummy;
void **exit_frame_p;
ompt_lw_taskteam_t lw_taskteam;
if (ompt_enabled.enabled) {
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking; its content was swapped
/* OMPT implicit task begin */
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
if (ompt_enabled.ompt_callback_implicit_task) {
OMPT_CUR_TASK_INFO(master_th)->thread_num =
__kmp_tid_from_gtid(gtid);
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
implicit_task_data, 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
}
/* OMPT state */
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
} else {
exit_frame_p = &dummy;
}
#endif
// AC: need to decrement t_serialized for enquiry functions to work
// correctly, will restore at join time
parent_team->t.t_serialized--;
{
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
,
exit_frame_p
#endif
);
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
*exit_frame_p = NULL;
OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, implicit_task_data, 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
}
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
__ompt_lw_taskteam_unlink(master_th);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
&ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
OMPT_INVOKER(call_context) | ompt_parallel_team,
return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
return TRUE;
}
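// Non-serialized case: the (hot) parent team itself runs this parallel region.
// Record the microtask and invoker, bump the nesting levels, shrink the team
// if a num_threads clause requested fewer threads, fix up proc-bind, and then
// release the workers waiting at the fork barrier via __kmp_internal_fork().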
parent_team->t.t_pkfn = microtask;
parent_team->t.t_invoke = invoker;
KMP_ATOMIC_INC(&root->r.r_in_parallel);
parent_team->t.t_active_level++;
parent_team->t.t_level++;
parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
ompt_lw_taskteam_t lw_taskteam;
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
}
#endif
/* Change number of threads in the team if requested */
if (master_set_numthreads) { // The parallel has num_threads clause
if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
// AC: can only reduce the number of threads dynamically, cannot increase it
kmp_info_t **other_threads = parent_team->t.t_threads;
// NOTE: if using distributed barrier, we need to run this code block
// even when the team size appears not to have changed from the max.
int old_proc = master_th->th.th_teams_size.nth;
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
bp_dist_bar) {
__kmp_resize_dist_barrier(parent_team, old_proc,
master_set_numthreads);
__kmp_add_threads_to_team(parent_team, master_set_numthreads);
}
parent_team->t.t_nproc = master_set_numthreads;
for (i = 0; i < master_set_numthreads; ++i) {
other_threads[i]->th.th_team_nproc = master_set_numthreads;
}
}
// Keep extra threads hot in the team for possible next parallels
master_th->th.th_set_nproc = 0;
}
#if USE_DEBUGGER
if (__kmp_debugging) { // Let debugger override number of threads.
int nth = __kmp_omp_num_threads(loc);
if (nth > 0) { // 0 means debugger doesn't want to change num threads
master_set_numthreads = nth;
}
}
#endif
// Figure out the proc_bind policy for the nested parallel within teams
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
// proc_bind_default means don't update
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else {
// No proc_bind clause specified; use current proc-bind-var
if (proc_bind == proc_bind_default) {
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
/* else: The proc_bind policy was specified explicitly on parallel
clause.
This overrides proc-bind-var for this parallel region, but does not
change proc-bind-var. */
// Figure out the value of proc-bind-var for the child threads.
if ((level + 1 < __kmp_nested_proc_bind.used) &&
(__kmp_nested_proc_bind.bind_types[level + 1] !=
master_th->th.th_current_task->td_icvs.proc_bind)) {
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
}
}
KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
// Need to change the bind-var ICV to the correct value for each implicit task
if (proc_bind_icv != proc_bind_default &&
master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
kmp_info_t **other_threads = parent_team->t.t_threads;
for (i = 0; i < master_th->th.th_team_nproc; ++i) {
other_threads[i]->th.th_current_task->td_icvs.proc_bind =
proc_bind_icv;
}
}
// Reset for next parallel region
master_th->th.th_set_proc_bind = proc_bind_default;
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
KMP_ITT_DEBUG) &&
__kmp_forkjoin_frames_mode == 3 &&
parent_team->t.t_active_level == 1 // only report frames at level 1
&& master_th->th.th_teams_size.nteams == 1) {
kmp_uint64 tmp_time = __itt_get_timestamp();
master_th->th.th_frame_time = tmp_time;
parent_team->t.t_region_time = tmp_time;
}
if (__itt_stack_caller_create_ptr) {
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
// create new stack stitching id before entering fork barrier
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
}
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
#if KMP_AFFINITY_SUPPORTED
__kmp_partition_places(parent_team);
#endif
KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, parent_team, master_th, gtid));
__kmp_internal_fork(loc, gtid, parent_team);
KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, parent_team, master_th, gtid));
if (call_context == fork_context_gnu)
return TRUE;
/* Invoke microtask for PRIMARY thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
if (!parent_team->t.t_invoke(gtid)) {
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
return TRUE;
} // Parallel closely nested in teams construct
#if KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec) {
KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
parent_team->t.t_task_team[master_th->th.th_task_state]);
}
#endif
// Need this to happen before we determine the number of threads, not while
// we are allocating the team
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
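// Decide how many threads this parallel region gets: a num_threads clause (or
// otherwise the nproc ICV) provides the request, max-active-levels and the
// serial library mode can force serialization, and __kmp_reserve_threads()
// applies the remaining capacity checks under the forkjoin lock.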
int enter_teams = 0;
if (parent_team->t.t_active_level >=
master_th->th.th_current_task->td_icvs.max_active_levels) {
nthreads = 1;
} else {
enter_teams = ((ap == NULL && active_level == 0) ||
(ap && teams_level > 0 && teams_level == level));
nthreads = master_set_numthreads
? master_set_numthreads
// TODO: get nproc directly from current task
: get__nproc_2(parent_team, master_tid);
// Check whether we need to take the forkjoin lock (no need for a serialized
// parallel outside of a teams construct). This code was moved here from
// __kmp_reserve_threads() to speed up nested serialized parallels.
if (nthreads > 1) {
if ((get__max_active_levels(master_th) == 1 &&
(root->r.r_in_parallel && !enter_teams)) ||
(__kmp_library == library_serial)) {
KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
" threads\n",
gtid, nthreads));
nthreads = 1;
}
}
if (nthreads > 1) {
/* determine how many new threads we can use */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
/* AC: If we execute teams from a parallel region (on the host), then the teams
   should be created, but each can have only 1 thread if nesting is disabled.
   If teams is called from a serial region, then the teams and their threads
   should be created regardless of the nesting setting. */
nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
nthreads, enter_teams);
if (nthreads == 1) {
// Free the lock for single-thread execution here; for multi-thread execution
// it will be freed later, after the team of threads has been created and
// initialized
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
}
}
KMP_DEBUG_ASSERT(nthreads > 0);
// If we temporarily changed the set number of threads then restore it now
master_th->th.th_set_nproc = 0;
/* create a serialized parallel region? */
if (nthreads == 1) {
/* josh todo: hypothetical question: what do we do for OS X*? */
#if KMP_OS_LINUX && \
(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
void *args[argc];
#else
void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
KMP_ARCH_AARCH64) */
KA_TRACE(20,
("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
__kmpc_serialized_parallel(loc, gtid);
#if OMPD_SUPPORT
master_th->th.th_serial_team->t.t_pkfn = microtask;
#endif
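// Serialized execution: for the Intel entry point the microtask is invoked
// inline below (with special handling when this is the outer fork of a teams
// construct); for the GNU entry point we return FALSE so that the caller can
// run the outlined body itself.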
if (call_context == fork_context_intel) {
/* TODO this sucks, use the compiler itself to pass args! :) */
master_th->th.th_serial_team->t.t_ident = loc;
if (!ap) {
// revert change made in __kmpc_serialized_parallel()
master_th->th.th_serial_team->t.t_level--;
// Get args from parent team for teams construct
#if OMPT_SUPPORT
void *dummy;
void **exit_frame_p;
ompt_task_info_t *task_info;
ompt_lw_taskteam_t lw_taskteam;
if (ompt_enabled.enabled) {
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking; its content was swapped
task_info = OMPT_CUR_TASK_INFO(master_th);
exit_frame_p = &(task_info->frame.exit_frame.ptr);
if (ompt_enabled.ompt_callback_implicit_task) {
OMPT_CUR_TASK_INFO(master_th)->thread_num =
__kmp_tid_from_gtid(gtid);
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
&(task_info->task_data), 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num,
ompt_task_implicit);
}
/* OMPT state */
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
} else {
exit_frame_p = &dummy;
}
#endif
{
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask(microtask, gtid, 0, argc,
parent_team->t.t_argv
#if OMPT_SUPPORT
,
exit_frame_p
#endif
);
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
*exit_frame_p = NULL;
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num,
ompt_task_implicit);
}
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
__ompt_lw_taskteam_unlink(master_th);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
&ompt_parallel_data, parent_task_data,
OMPT_INVOKER(call_context) | ompt_parallel_team,
return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
} else if (microtask == (microtask_t)__kmp_teams_master) {
KMP_DEBUG_ASSERT(master_th->th.th_team ==
master_th->th.th_serial_team);
team = master_th->th.th_team;
// team->t.t_pkfn = microtask;
team->t.t_invoke = invoker;
__kmp_alloc_argv_entries(argc, team, TRUE);
team->t.t_argc = argc;
argv = (void **)team->t.t_argv;
if (ap) {
for (i = argc - 1; i >= 0; --i)
*argv++ = va_arg(kmp_va_deref(ap), void *);
} else {
for (i = 0; i < argc; ++i)
// Get args from parent team for teams construct
argv[i] = parent_team->t.t_argv[i];
}
// AC: revert change made in __kmpc_serialized_parallel()
// because initial code in teams should have level=0
team->t.t_level--;
// AC: call special invoker for outer "parallel" of teams construct
invoker(gtid);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 0,
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
}
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
&ompt_parallel_data, parent_task_data,
OMPT_INVOKER(call_context) | ompt_parallel_league,
return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
} else {
argv = args;
for (i = argc - 1; i >= 0; --i)
*argv++ = va_arg(kmp_va_deref(ap), void *);
KMP_MB();
#if OMPT_SUPPORT
void *dummy;
void **exit_frame_p;
ompt_task_info_t *task_info;
ompt_lw_taskteam_t lw_taskteam;
if (ompt_enabled.enabled) {
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking; its content was swapped
task_info = OMPT_CUR_TASK_INFO(master_th);
exit_frame_p = &(task_info->frame.exit_frame.ptr);
/* OMPT implicit task begin */
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
ompt_task_implicit);
OMPT_CUR_TASK_INFO(master_th)->thread_num =
__kmp_tid_from_gtid(gtid);
}
/* OMPT state */
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
} else {
exit_frame_p = &dummy;
}
#endif
{
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask(microtask, gtid, 0, argc, args
#if OMPT_SUPPORT
,
exit_frame_p
#endif
);
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
*exit_frame_p = NULL;
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num,
ompt_task_implicit);
}
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
__ompt_lw_taskteam_unlink(master_th);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
&ompt_parallel_data, parent_task_data,
OMPT_INVOKER(call_context) | ompt_parallel_team,
return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
}
} else if (call_context == fork_context_gnu) {
#if OMPT_SUPPORT
ompt_lw_taskteam_t lwt;
__ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
return_address);
lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
__ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking; its content was swapped
#endif
// we were called from GNU native code
KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
return FALSE;
} else {
KMP_ASSERT2(call_context < fork_context_last,
"__kmp_fork_call: unknown fork_context parameter");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
KMP_MB();
return FALSE;
} // if (nthreads == 1)
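// Parallel path: from here on a real team of nthreads threads is set up.
// Compute the ICVs for the implicit tasks (nproc and proc-bind), allocate or
// reuse a team, copy the arguments, fork the workers, and finally invoke the
// microtask on the primary thread.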
// GEH: only modify the executing flag in the case when not serialized;
// the serialized case is handled in __kmpc_serialized_parallel
KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
"curtask=%p, curtask_max_aclevel=%d\n",
parent_team->t.t_active_level, master_th,
master_th->th.th_current_task,
master_th->th.th_current_task->td_icvs.max_active_levels));
// TODO: GEH - cannot do this assertion because root thread not set up as
// executing
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
master_th->th.th_current_task->td_flags.executing = 0;
if (!master_th->th.th_teams_microtask || level > teams_level) {
/* Increment our nested depth level */
KMP_ATOMIC_INC(&root->r.r_in_parallel);
}
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
if ((level + 1 < __kmp_nested_nth.used) &&
(__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
nthreads_icv = __kmp_nested_nth.nth[level + 1];
} else {
nthreads_icv = 0; // don't update
}
// Figure out the proc_bind policy for the new team.
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
// proc_bind_default means don't update
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else {
// No proc_bind clause specified; use current proc-bind-var for this
// parallel region
if (proc_bind == proc_bind_default) {
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
// Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
if (master_th->th.th_teams_microtask &&
microtask == (microtask_t)__kmp_teams_master) {
proc_bind = __kmp_teams_proc_bind;
}
/* else: The proc_bind policy was specified explicitly on parallel clause.
This overrides proc-bind-var for this parallel region, but does not
change proc-bind-var. */
// Figure out the value of proc-bind-var for the child threads.
if ((level + 1 < __kmp_nested_proc_bind.used) &&
(__kmp_nested_proc_bind.bind_types[level + 1] !=
master_th->th.th_current_task->td_icvs.proc_bind)) {
// Do not modify the proc-bind ICV for the two teams-construct forks;
// they just let the proc-bind ICV pass through
if (!master_th->th.th_teams_microtask ||
!(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
}
}
// Reset for next parallel region
master_th->th.th_set_proc_bind = proc_bind_default;
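// Pass a private copy of the ICVs to __kmp_allocate_team() only when nproc or
// proc-bind must differ from the parent task's values; with the distributed
// barrier the chosen ICVs are also mirrored into the barrier's team_icvs.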
if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
kmp_internal_control_t new_icvs;
copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
new_icvs.next = NULL;
if (nthreads_icv > 0) {
new_icvs.nproc = nthreads_icv;
}
if (proc_bind_icv != proc_bind_default) {
new_icvs.proc_bind = proc_bind_icv;
}
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
proc_bind, &new_icvs,
argc USE_NESTED_HOT_ARG(master_th));
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
} else {
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
proc_bind,
&master_th->th.th_current_task->td_icvs,
argc USE_NESTED_HOT_ARG(master_th));
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
&master_th->th.th_current_task->td_icvs);
}
KF_TRACE(
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
/* setup the new team */
KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
KMP_CHECK_UPDATE(team->t.t_ident, loc);
KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
#if OMPT_SUPPORT
KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
return_address);
#endif
KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
// TODO: parent_team->t.t_level == INT_MAX ???
if (!master_th->th.th_teams_microtask || level > teams_level) {
int new_level = parent_team->t.t_level + 1;
KMP_CHECK_UPDATE(team->t.t_level, new_level);
new_level = parent_team->t.t_active_level + 1;
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
} else {
// AC: Do not increase parallel level at start of the teams construct
int new_level = parent_team->t.t_level;
KMP_CHECK_UPDATE(team->t.t_level, new_level);
new_level = parent_team->t.t_active_level;
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
}
kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
// set primary thread's schedule as new run-time schedule
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
// Update the floating point rounding in the team if required.
propagateFPControl(team);
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP)
ompd_bp_parallel_begin();
#endif
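// Tasking support: save the primary thread's current task_state on its memo
// stack so it can be restored at join time. The stack grows on demand for deep
// nesting, and for a nested hot team the state previously recorded for that
// level is reused instead of starting from 0.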
if (__kmp_tasking_mode != tskm_immediate_exec) {
// Set the primary thread's task team to the new team's task team. Unless this
// is a hot team, it should be NULL.
KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
parent_team->t.t_task_team[master_th->th.th_task_state]);
KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
"%p, new task_team %p / team %p\n",
__kmp_gtid_from_thread(master_th),
master_th->th.th_task_team, parent_team,
team->t.t_task_team[master_th->th.th_task_state], team));
if (active_level || master_th->th.th_task_team) {
// Save the primary thread's task_state
KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
if (master_th->th.th_task_state_top >=
master_th->th.th_task_state_stack_sz) { // increase size
kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
kmp_uint8 *old_stack, *new_stack;
kmp_uint32 i;
new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
new_stack[i] = master_th->th.th_task_state_memo_stack[i];
}
for (i = master_th->th.th_task_state_stack_sz; i < new_size;
++i) { // zero-init rest of stack
new_stack[i] = 0;
}
old_stack = master_th->th.th_task_state_memo_stack;
master_th->th.th_task_state_memo_stack = new_stack;
master_th->th.th_task_state_stack_sz = new_size;
__kmp_free(old_stack);
}
// Store primary thread's task_state on stack
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top] =
master_th->th.th_task_state;
master_th->th.th_task_state_top++;
#if KMP_NESTED_HOT_TEAMS
if (master_th->th.th_hot_teams &&
active_level < __kmp_hot_teams_max_level &&
team == master_th->th.th_hot_teams[active_level].hot_team) {
// Restore primary thread's nested state if nested hot team
master_th->th.th_task_state =
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top];
} else {
#endif
master_th->th.th_task_state = 0;
#if KMP_NESTED_HOT_TEAMS
}
#endif
}
#if !KMP_NESTED_HOT_TEAMS
KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
(team == root->r.r_hot_team));
#endif
}
KA_TRACE(
20,
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
team->t.t_nproc));
KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
(team->t.t_master_tid == 0 &&
(team->t.t_parent == root->r.r_root_team ||
team->t.t_parent->t.t_serialized)));
KMP_MB();
/* now, setup the arguments */
argv = (void **)team->t.t_argv;
if (ap) {
for (i = argc - 1; i >= 0; --i) {
void *new_argv = va_arg(kmp_va_deref(ap), void *);
KMP_CHECK_UPDATE(*argv, new_argv);
argv++;
}
} else {
for (i = 0; i < argc; ++i) {
// Get args from parent team for teams construct
KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
}
}
/* now actually fork the threads */
KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
root->r.r_active = TRUE;
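// __kmp_fork_team_threads() binds the primary thread and the workers into the
// new team (allocating new worker threads as needed), and
// __kmp_setup_icv_copy() arranges for the ICVs to be propagated to the workers
// before they start running.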
__kmp_fork_team_threads(root, team, master_th, gtid, !ap);
__kmp_setup_icv_copy(team, nthreads,
&master_th->th.th_current_task->td_icvs, loc);
#if OMPT_SUPPORT
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
#endif
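// The forkjoin lock taken in the reservation step above is held across team
// allocation and thread fork; release it now that the team is fully formed.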
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if USE_ITT_BUILD
if (team->t.t_active_level == 1 // only report frames at level 1
&& !master_th->th.th_teams_microtask) { // not in teams construct
#if USE_ITT_NOTIFY
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
(__kmp_forkjoin_frames_mode == 3 ||
__kmp_forkjoin_frames_mode == 1)) {
kmp_uint64 tmp_time = 0;
if (__itt_get_timestamp_ptr)
tmp_time = __itt_get_timestamp();
// Internal fork - report frame begin
master_th->th.th_frame_time = tmp_time;
if (__kmp_forkjoin_frames_mode == 3)
team->t.t_region_time = tmp_time;
} else
// only one notification scheme (either "submit" or "forking/joined", not both)
#endif /* USE_ITT_NOTIFY */
if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
__kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
// Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
__kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
}
}
#endif /* USE_ITT_BUILD */
/* now go on and do the work */
KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
KMP_MB();
KF_TRACE(10,
("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
// create new stack stitching id before entering fork barrier
if (!enter_teams) {
KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
team->t.t_stack_id = __kmp_itt_stack_caller_create();
} else if (parent_team->t.t_serialized) {
// keep stack stitching id in the serialized parent_team;
// current team will be used for parallel inside the teams;
// if parent_team is active, then it already keeps stack stitching id
// for the league of teams
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
}
}
#endif /* USE_ITT_BUILD */
// AC: skip __kmp_internal_fork for a teams construct; let only the primary
// threads execute
if (ap) {
__kmp_internal_fork(loc, gtid, team);
KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
}
if (call_context == fork_context_gnu) {
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
return TRUE;
}
/* Invoke microtask for PRIMARY thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
} // END of timer KMP_fork_call block
#if KMP_STATS_ENABLED
// If beginning a teams construct, then change thread state
stats_state_e previous_state = KMP_GET_THREAD_STATE();
if (!ap) {
KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
}
#endif
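// The primary thread now runs its portion of the parallel region through the
// team's invoker (the launch routine supplied by the caller, e.g.
// __kmp_invoke_task_func for the Intel entry point); the workers were released
// earlier and will meet the primary thread again at the join barrier.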
if (!team->t.t_invoke(gtid)) {
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
}
#if KMP_STATS_ENABLED
// If this was the beginning of a teams construct, then reset the thread state
if (!ap) {
KMP_SET_THREAD_STATE(previous_state);
}
#endif
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
return TRUE;
}