int __kmp_fork_call()

in openmp/runtime/src/kmp_runtime.cpp [1355:2307]


int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack, to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* The two lines below keep the alloca above from being optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    // AC: potentially unsafe, not in sync with shutdown
    master_th = __kmp_threads[gtid];
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Assign affinity to root thread if it hasn't happened yet
    __kmp_assign_root_init_mask();

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serialized teams even when nesting is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel that is nested inside a teams
      // construct. The team is actual (hot): all workers are already waiting
      // at the fork barrier. No lock is needed to do a bit of team
      // initialization and then release the workers.
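      // Illustrative (assumed) user code that takes this branch:
      //   #pragma omp teams num_teams(2) thread_limit(4)
      //   #pragma omp parallel          // nested directly inside teams
      //   { ... }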
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

        if (call_context == fork_context_gnu) {
          // AC: need to decrement t_serialized for enquiry functions to work
          // correctly, will restore at join time
          parent_team->t.t_serialized--;
          return TRUE;
        }

#if OMPD_SUPPORT
        parent_team->t.t_pkfn = microtask;
#endif

#if OMPT_SUPPORT
        void *dummy;
        void **exit_frame_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking; its contents were swapped

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
          );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                &ompt_parallel_data, return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
      }
#endif

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically, never
          // increase it
          kmp_info_t **other_threads = parent_team->t.t_threads;
          // NOTE: if using distributed barrier, we need to run this code block
          // even when the team size appears not to have changed from the max.
          int old_proc = master_th->th.th_teams_size.nth;
          if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
              bp_dist_bar) {
            __kmp_resize_dist_barrier(parent_team, old_proc,
                                      master_set_numthreads);
            __kmp_add_threads_to_team(parent_team, master_set_numthreads);
          }
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
          }
        }
        // Keep extra threads hot in the team for a possible subsequent parallel
        master_th->th.th_set_nproc = 0;
      }
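      // For example (assumed): "#pragma omp parallel num_threads(2)" inside a
      // teams region created with thread_limit(4) shrinks the hot team to 2
      // here; the other 2 threads stay allocated for reuse by a later parallel.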

#if USE_DEBUGGER
      if (__kmp_debugging) { // Let debugger override number of threads.
        int nth = __kmp_omp_num_threads(loc);
        if (nth > 0) { // 0 means debugger doesn't want to change num threads
          master_set_numthreads = nth;
        }
      }
#endif

      // Figure out the proc_bind policy for the nested parallel within teams
      kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
      // proc_bind_default means don't update
      kmp_proc_bind_t proc_bind_icv = proc_bind_default;
      if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
        proc_bind = proc_bind_false;
      } else {
        // No proc_bind clause specified; use current proc-bind-var
        if (proc_bind == proc_bind_default) {
          proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
        }
        /* else: The proc_bind policy was specified explicitly on the parallel
           clause. This overrides proc-bind-var for this parallel region, but
           does not change proc-bind-var. */
        // Determine the value of proc-bind-var for the child threads.
        if ((level + 1 < __kmp_nested_proc_bind.used) &&
            (__kmp_nested_proc_bind.bind_types[level + 1] !=
             master_th->th.th_current_task->td_icvs.proc_bind)) {
          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
        }
      }
      KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
      // Need to change the bind-var ICV to correct value for each implicit task
      if (proc_bind_icv != proc_bind_default &&
          master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
        kmp_info_t **other_threads = parent_team->t.t_threads;
        for (i = 0; i < master_th->th.th_team_nproc; ++i) {
          other_threads[i]->th.th_current_task->td_icvs.proc_bind =
              proc_bind_icv;
        }
      }
      // Reset for next parallel region
      master_th->th.th_set_proc_bind = proc_bind_default;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
           KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode == 3 &&
          parent_team->t.t_active_level == 1 // only report frames at level 1
          && master_th->th.th_teams_size.nteams == 1) {
        kmp_uint64 tmp_time = __itt_get_timestamp();
        master_th->th.th_frame_time = tmp_time;
        parent_team->t.t_region_time = tmp_time;
      }
      if (__itt_stack_caller_create_ptr) {
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        // create new stack stitching id before entering fork barrier
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
#if KMP_AFFINITY_SUPPORTED
      __kmp_partition_places(parent_team);
#endif

      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));
      __kmp_internal_fork(loc, gtid, parent_team);
      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));

      if (call_context == fork_context_gnu)
        return TRUE;

      /* Invoke microtask for PRIMARY thread */
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));

      if (!parent_team->t.t_invoke(gtid)) {
        KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
      }
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));
      KMP_MB(); /* Flush all pending memory write invalidates.  */

      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));

      return TRUE;
    } // Parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif

    // Need this to happen before we determine the number of threads, not while
    // we are allocating the team
    //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
    int enter_teams = 0;
    if (parent_team->t.t_active_level >=
        master_th->th.th_current_task->td_icvs.max_active_levels) {
      nthreads = 1;
    } else {
      enter_teams = ((ap == NULL && active_level == 0) ||
                     (ap && teams_level > 0 && teams_level == level));
      nthreads = master_set_numthreads
                     ? master_set_numthreads
                     // TODO: get nproc directly from current task
                     : get__nproc_2(parent_team, master_tid);
      // Do we need to take the forkjoin lock? (There is no need for a
      // serialized parallel outside a teams construct.) This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
      if (nthreads > 1) {
        if ((get__max_active_levels(master_th) == 1 &&
             (root->r.r_in_parallel && !enter_teams)) ||
            (__kmp_library == library_serial)) {
          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
                        " threads\n",
                        gtid, nthreads));
          nthreads = 1;
        }
      }
      if (nthreads > 1) {
        /* determine how many new threads we can use */
        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), the
           teams should be created, but each can have only 1 thread if nesting
           is disabled. If teams is called from a serial region, the teams and
           their threads should be created regardless of the nesting setting. */
        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
                                         nthreads, enter_teams);
        if (nthreads == 1) {
          // Release the lock for single-threaded execution here; for
          // multithreaded execution it will be released later, after the team
          // of threads has been created and initialized
          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
        }
      }
    }
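    // Worked example (assumed): if max-active-levels-var is 1 and we are
    // already inside an active parallel region (root->r.r_in_parallel), a
    // nested "#pragma omp parallel num_threads(4)" falls into the branches
    // above and is serialized: nthreads ends up as 1.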
    KMP_DEBUG_ASSERT(nthreads > 0);

    // The requested number of threads has been consumed; clear the request now
    master_th->th.th_set_nproc = 0;

    /* create a serialized parallel region? */
    if (nthreads == 1) {
/* josh todo: hypothetical question: what do we do for OS X*? */
#if KMP_OS_LINUX &&                                                            \
    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
      void *args[argc];
#else
      void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
          KMP_ARCH_AARCH64) */

      KA_TRACE(20,
               ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));

      __kmpc_serialized_parallel(loc, gtid);
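      // At this point __kmpc_serialized_parallel() has pushed the serial team
      // and bumped its t_level/t_serialized, so enquiry functions such as
      // omp_get_level() already see the serialized region.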

#if OMPD_SUPPORT
      master_th->th.th_serial_team->t.t_pkfn = microtask;
#endif

      if (call_context == fork_context_intel) {
        /* TODO this sucks, use the compiler itself to pass args! :) */
        master_th->th.th_serial_team->t.t_ident = loc;
        if (!ap) {
          // revert change made in __kmpc_serialized_parallel()
          master_th->th.th_serial_team->t.t_level--;
          // Get args from parent team for teams construct

#if OMPT_SUPPORT
          void *dummy;
          void **exit_frame_p;
          ompt_task_info_t *task_info;

          ompt_lw_taskteam_t lw_taskteam;

          if (ompt_enabled.enabled) {
            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                    &ompt_parallel_data, return_address);

            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its contents were swapped

            task_info = OMPT_CUR_TASK_INFO(master_th);
            exit_frame_p = &(task_info->frame.exit_frame.ptr);
            if (ompt_enabled.ompt_callback_implicit_task) {
              OMPT_CUR_TASK_INFO(master_th)->thread_num =
                  __kmp_tid_from_gtid(gtid);
              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                  &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit);
            }

            /* OMPT state */
            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
          } else {
            exit_frame_p = &dummy;
          }
#endif

          {
            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
            __kmp_invoke_microtask(microtask, gtid, 0, argc,
                                   parent_team->t.t_argv
#if OMPT_SUPPORT
                                   ,
                                   exit_frame_p
#endif
            );
          }

#if OMPT_SUPPORT
          if (ompt_enabled.enabled) {
            *exit_frame_p = NULL;
            if (ompt_enabled.ompt_callback_implicit_task) {
              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                  ompt_scope_end, NULL, &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit);
            }
            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
            __ompt_lw_taskteam_unlink(master_th);
            if (ompt_enabled.ompt_callback_parallel_end) {
              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                  &ompt_parallel_data, parent_task_data,
                  OMPT_INVOKER(call_context) | ompt_parallel_team,
                  return_address);
            }
            master_th->th.ompt_thread_info.state = ompt_state_overhead;
          }
#endif
        } else if (microtask == (microtask_t)__kmp_teams_master) {
          KMP_DEBUG_ASSERT(master_th->th.th_team ==
                           master_th->th.th_serial_team);
          team = master_th->th.th_team;
          // team->t.t_pkfn = microtask;
          team->t.t_invoke = invoker;
          __kmp_alloc_argv_entries(argc, team, TRUE);
          team->t.t_argc = argc;
          argv = (void **)team->t.t_argv;
          if (ap) {
            for (i = argc - 1; i >= 0; --i)
              *argv++ = va_arg(kmp_va_deref(ap), void *);
          } else {
            for (i = 0; i < argc; ++i)
              // Get args from parent team for teams construct
              argv[i] = parent_team->t.t_argv[i];
          }
          // AC: revert change made in __kmpc_serialized_parallel()
          //     because initial code in teams should have level=0
          team->t.t_level--;
          // AC: call special invoker for outer "parallel" of teams construct
          invoker(gtid);
#if OMPT_SUPPORT
          if (ompt_enabled.enabled) {
            ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
            if (ompt_enabled.ompt_callback_implicit_task) {
              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                  ompt_scope_end, NULL, &(task_info->task_data), 0,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
            }
            if (ompt_enabled.ompt_callback_parallel_end) {
              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                  &ompt_parallel_data, parent_task_data,
                  OMPT_INVOKER(call_context) | ompt_parallel_league,
                  return_address);
            }
            master_th->th.ompt_thread_info.state = ompt_state_overhead;
          }
#endif
        } else {
          argv = args;
          for (i = argc - 1; i >= 0; --i)
            *argv++ = va_arg(kmp_va_deref(ap), void *);
          KMP_MB();

#if OMPT_SUPPORT
          void *dummy;
          void **exit_frame_p;
          ompt_task_info_t *task_info;

          ompt_lw_taskteam_t lw_taskteam;

          if (ompt_enabled.enabled) {
            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                    &ompt_parallel_data, return_address);
            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its contents were swapped
            task_info = OMPT_CUR_TASK_INFO(master_th);
            exit_frame_p = &(task_info->frame.exit_frame.ptr);

            /* OMPT implicit task begin */
            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
            if (ompt_enabled.ompt_callback_implicit_task) {
              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
                  ompt_task_implicit);
              OMPT_CUR_TASK_INFO(master_th)->thread_num =
                  __kmp_tid_from_gtid(gtid);
            }

            /* OMPT state */
            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
          } else {
            exit_frame_p = &dummy;
          }
#endif

          {
            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
            __kmp_invoke_microtask(microtask, gtid, 0, argc, args
#if OMPT_SUPPORT
                                   ,
                                   exit_frame_p
#endif
            );
          }

#if OMPT_SUPPORT
          if (ompt_enabled.enabled) {
            *exit_frame_p = NULL;
            if (ompt_enabled.ompt_callback_implicit_task) {
              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                  ompt_scope_end, NULL, &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit);
            }

            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
            __ompt_lw_taskteam_unlink(master_th);
            if (ompt_enabled.ompt_callback_parallel_end) {
              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                  &ompt_parallel_data, parent_task_data,
                  OMPT_INVOKER(call_context) | ompt_parallel_team,
                  return_address);
            }
            master_th->th.ompt_thread_info.state = ompt_state_overhead;
          }
#endif
        }
      } else if (call_context == fork_context_gnu) {
#if OMPT_SUPPORT
        ompt_lw_taskteam_t lwt;
        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
                                return_address);

        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
        __ompt_lw_taskteam_link(&lwt, master_th, 1);
        // don't use lw_taskteam after linking; its contents were swapped
#endif

        // we were called from GNU native code
        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
        return FALSE;
      } else {
        KMP_ASSERT2(call_context < fork_context_last,
                    "__kmp_fork_call: unknown fork_context parameter");
      }

      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
      KMP_MB();
      return FALSE;
    } // if (nthreads == 1)

    // GEH: only modify the executing flag when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
                  "curtask=%p, curtask_max_aclevel=%d\n",
                  parent_team->t.t_active_level, master_th,
                  master_th->th.th_current_task,
                  master_th->th.th_current_task->td_icvs.max_active_levels));
    // TODO: GEH - cannot do this assertion because root thread not set up as
    // executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
    master_th->th.th_current_task->td_flags.executing = 0;

    if (!master_th->th.th_teams_microtask || level > teams_level) {
      /* Increment our nested depth level */
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
    }

    // See if we need to make a copy of the ICVs.
    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
    if ((level + 1 < __kmp_nested_nth.used) &&
        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
      nthreads_icv = __kmp_nested_nth.nth[level + 1];
    } else {
      nthreads_icv = 0; // don't update
    }
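    // For example (assumed): OMP_NUM_THREADS=4,2 yields __kmp_nested_nth =
    // {4,2}, so a fork at level 0 propagates nth[1] == 2 as the workers' nproc
    // ICV, making an inner parallel default to 2 threads.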

    // Figure out the proc_bind_policy for the new team.
    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
    // proc_bind_default means don't update
    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
      proc_bind = proc_bind_false;
    } else {
      // No proc_bind clause specified; use current proc-bind-var for this
      // parallel region
      if (proc_bind == proc_bind_default) {
        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
      }
      // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
      if (master_th->th.th_teams_microtask &&
          microtask == (microtask_t)__kmp_teams_master) {
        proc_bind = __kmp_teams_proc_bind;
      }
      /* else: The proc_bind policy was specified explicitly on the parallel
         clause. This overrides proc-bind-var for this parallel region, but
         does not change proc-bind-var. */
      // Determine the value of proc-bind-var for the child threads.
      if ((level + 1 < __kmp_nested_proc_bind.used) &&
          (__kmp_nested_proc_bind.bind_types[level + 1] !=
           master_th->th.th_current_task->td_icvs.proc_bind)) {
        // Do not modify the proc-bind ICV for the two teams-construct forks;
        // they just let the proc-bind ICV pass through
        if (!master_th->th.th_teams_microtask ||
            !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
      }
    }

    // Reset for next parallel region
    master_th->th.th_set_proc_bind = proc_bind_default;
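    // Worked example (assumed): with OMP_PROC_BIND=spread,close and no
    // proc_bind clause, a level-0 parallel resolves proc_bind to spread
    // (proc-bind-var), while bind_types[1] == close differs from the inherited
    // ICV, so proc_bind_icv becomes close for the child threads' bind-var.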

    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
      kmp_internal_control_t new_icvs;
      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
      new_icvs.next = NULL;
      if (nthreads_icv > 0) {
        new_icvs.nproc = nthreads_icv;
      }
      if (proc_bind_icv != proc_bind_default) {
        new_icvs.proc_bind = proc_bind_icv;
      }

      /* allocate a new parallel team */
      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
      team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
                                 ompt_parallel_data,
#endif
                                 proc_bind, &new_icvs,
                                 argc USE_NESTED_HOT_ARG(master_th));
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
    } else {
      /* allocate a new parallel team */
      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
      team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
                                 ompt_parallel_data,
#endif
                                 proc_bind,
                                 &master_th->th.th_current_task->td_icvs,
                                 argc USE_NESTED_HOT_ARG(master_th));
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
                  &master_th->th.th_current_task->td_icvs);
    }
    KF_TRACE(
        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));

    /* setup the new team */
    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
    KMP_CHECK_UPDATE(team->t.t_ident, loc);
    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
#if OMPT_SUPPORT
    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
                          return_address);
#endif
    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
    // TODO: parent_team->t.t_level == INT_MAX ???
    if (!master_th->th.th_teams_microtask || level > teams_level) {
      int new_level = parent_team->t.t_level + 1;
      KMP_CHECK_UPDATE(team->t.t_level, new_level);
      new_level = parent_team->t.t_active_level + 1;
      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
    } else {
      // AC: Do not increase parallel level at start of the teams construct
      int new_level = parent_team->t.t_level;
      KMP_CHECK_UPDATE(team->t.t_level, new_level);
      new_level = parent_team->t.t_active_level;
      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
    }
    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
    // set primary thread's schedule as new run-time schedule
    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);

    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);

    // Update the floating point rounding in the team if required.
    propagateFPControl(team);
#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_parallel_begin();
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the primary thread's task team to the team's task team. Unless
      // this is a hot team, it should be NULL.
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
      KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
                    "%p, new task_team %p / team %p\n",
                    __kmp_gtid_from_thread(master_th),
                    master_th->th.th_task_team, parent_team,
                    team->t.t_task_team[master_th->th.th_task_state], team));

      if (active_level || master_th->th.th_task_team) {
        // Save the primary thread's task_state
        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
        if (master_th->th.th_task_state_top >=
            master_th->th.th_task_state_stack_sz) { // increase size
          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
          kmp_uint8 *old_stack, *new_stack;
          kmp_uint32 i;
          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
          }
          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
               ++i) { // zero-init rest of stack
            new_stack[i] = 0;
          }
          old_stack = master_th->th.th_task_state_memo_stack;
          master_th->th.th_task_state_memo_stack = new_stack;
          master_th->th.th_task_state_stack_sz = new_size;
          __kmp_free(old_stack);
        }
        // Store primary thread's task_state on stack
        master_th->th
            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
            master_th->th.th_task_state;
        master_th->th.th_task_state_top++;
#if KMP_NESTED_HOT_TEAMS
        if (master_th->th.th_hot_teams &&
            active_level < __kmp_hot_teams_max_level &&
            team == master_th->th.th_hot_teams[active_level].hot_team) {
          // Restore the primary thread's nested state if this is a nested hot
          // team
          master_th->th.th_task_state =
              master_th->th
                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
        } else {
#endif
          master_th->th.th_task_state = 0;
#if KMP_NESTED_HOT_TEAMS
        }
#endif
      }
#if !KMP_NESTED_HOT_TEAMS
      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
                       (team == root->r.r_hot_team));
#endif
    }

    KA_TRACE(
        20,
        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
         team->t.t_nproc));
    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
                     (team->t.t_master_tid == 0 &&
                      (team->t.t_parent == root->r.r_root_team ||
                       team->t.t_parent->t.t_serialized)));
    KMP_MB();

    /* now, setup the arguments */
    argv = (void **)team->t.t_argv;
    if (ap) {
      for (i = argc - 1; i >= 0; --i) {
        void *new_argv = va_arg(kmp_va_deref(ap), void *);
        KMP_CHECK_UPDATE(*argv, new_argv);
        argv++;
      }
    } else {
      for (i = 0; i < argc; ++i) {
        // Get args from parent team for teams construct
        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
      }
    }

    /* now actually fork the threads */
    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
      root->r.r_active = TRUE;

    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
    __kmp_setup_icv_copy(team, nthreads,
                         &master_th->th.th_current_task->td_icvs, loc);

#if OMPT_SUPPORT
    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
#endif

    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

#if USE_ITT_BUILD
    if (team->t.t_active_level == 1 // only report frames at level 1
        && !master_th->th.th_teams_microtask) { // not in teams construct
#if USE_ITT_NOTIFY
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          (__kmp_forkjoin_frames_mode == 3 ||
           __kmp_forkjoin_frames_mode == 1)) {
        kmp_uint64 tmp_time = 0;
        if (__itt_get_timestamp_ptr)
          tmp_time = __itt_get_timestamp();
        // Internal fork - report frame begin
        master_th->th.th_frame_time = tmp_time;
        if (__kmp_forkjoin_frames_mode == 3)
          team->t.t_region_time = tmp_time;
      } else
// only one notification scheme (either "submit" or "forking/joined", not both)
#endif /* USE_ITT_NOTIFY */
          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
        // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
      }
    }
#endif /* USE_ITT_BUILD */

    /* now go on and do the work */
    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
    KMP_MB();
    KF_TRACE(10,
             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
              root, team, master_th, gtid));

#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr) {
      // create new stack stitching id before entering fork barrier
      if (!enter_teams) {
        KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
        team->t.t_stack_id = __kmp_itt_stack_caller_create();
      } else if (parent_team->t.t_serialized) {
        // keep the stack stitching id in the serialized parent_team; the
        // current team will be used for the parallel inside the teams
        // construct; if parent_team is active, it already keeps the stack
        // stitching id for the league of teams
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
    }
#endif /* USE_ITT_BUILD */

    // AC: skip __kmp_internal_fork at teams construct, let only primary
    // threads execute
    if (ap) {
      __kmp_internal_fork(loc, gtid, team);
      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, team, master_th, gtid));
    }

    if (call_context == fork_context_gnu) {
      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
      return TRUE;
    }

    /* Invoke microtask for PRIMARY thread */
    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                  team->t.t_id, team->t.t_pkfn));
  } // END of timer KMP_fork_call block

#if KMP_STATS_ENABLED
  // If beginning a teams construct, then change thread state
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (!ap) {
    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
  }
#endif

  if (!team->t.t_invoke(gtid)) {
    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
  }

#if KMP_STATS_ENABLED
  // If this was the beginning of a teams construct, reset the thread state
  if (!ap) {
    KMP_SET_THREAD_STATE(previous_state);
  }
#endif

  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                team->t.t_id, team->t.t_pkfn));
  KMP_MB(); /* Flush all pending memory write invalidates.  */

  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  return TRUE;
}