1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 #include "kmp_dispatch.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 
36 /* these are temporary issues to be dealt with */
37 #define KMP_USE_PRCTL 0
38 
39 #if KMP_OS_WINDOWS
40 #include <process.h>
41 #endif
42 
43 #include "tsan_annotations.h"
44 
45 #if defined(KMP_GOMP_COMPAT)
46 char const __kmp_version_alt_comp[] =
47  KMP_VERSION_PREFIX "alternative compiler support: yes";
48 #endif /* defined(KMP_GOMP_COMPAT) */
49 
50 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51 #if OMP_50_ENABLED
52  "5.0 (201611)";
53 #elif OMP_45_ENABLED
54  "4.5 (201511)";
55 #elif OMP_40_ENABLED
56  "4.0 (201307)";
57 #else
58  "3.1 (201107)";
59 #endif
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63  KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79  int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81  kmp_internal_control_t *new_icvs,
82  ident_t *loc);
83 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85  int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91  kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* fast (and somewhat portable) way to get unique identifier of executing
107  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115 
116  KA_TRACE(
117  1000,
118  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
119  __kmp_nth, __kmp_all_nth));
120 
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124  __kmp_init_gtid for this to work. */
125 
126  if (!TCR_4(__kmp_init_gtid))
127  return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132  return __kmp_gtid;
133  }
134 #endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137  return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143 
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145  access to __kmp_threads array. For example:
146  1. Current thread loads other_threads[i] to thr and checks it, it is
147  non-NULL.
148  2. Current thread is suspended by OS.
149  3. Another thread unregisters and finishes (debug versions of free()
150  may fill memory with something like 0xEF).
151  4. Current thread is resumed.
152  5. Current thread reads junk from *thr.
153  TODO: Fix it. --ln */
154 
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158  if (!thr)
159  continue;
160 
161  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164  /* stack grows down -- search through all of the active threads */
165 
166  if (stack_addr <= stack_base) {
167  size_t stack_diff = stack_base - stack_addr;
168 
169  if (stack_diff <= stack_size) {
170  /* The only way we can be closer than the allocated */
171  /* stack size is if we are running on this thread. */
172  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173  return i;
174  }
175  }
176  }
177 
178  /* get specific to try and determine our gtid */
179  KA_TRACE(1000,
180  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181  "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183 
184  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
185 
186  /* if we haven't been assigned a gtid, then return the error code */
187  if (i < 0)
188  return i;
189 
190  /* dynamically updated stack window for uber threads to avoid get_specific
191  call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193  KMP_FATAL(StackOverflow, i);
194  }
195 
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201  stack_base);
202  } else {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  stack_base - stack_addr);
205  }
206 
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212  other_threads[i]->th.th_info.ds.ds_stacksize,
213  "th_%d stack (refinement)", i);
214  }
215  return i;
216 }
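/* Illustrative sketch (not compiled into the runtime): the stack-window test
   used by the loop above, reduced to a standalone helper. Names here are
   hypothetical; the real code reads ds_stackbase/ds_stacksize through the
   TCR_* volatile-read macros. */
#if 0
#include <cstddef>

// True if `addr` lies inside a downward-growing stack whose top is
// `stack_base` and whose recorded extent is `stack_size` bytes.
static bool kmp_example_addr_in_stack(const char *addr, const char *stack_base,
                                      std::size_t stack_size) {
  return addr <= stack_base &&
         static_cast<std::size_t>(stack_base - addr) <= stack_size;
}
#endif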
217 
218 int __kmp_get_global_thread_id_reg() {
219  int gtid;
220 
221  if (!__kmp_init_serial) {
222  gtid = KMP_GTID_DNE;
223  } else
224 #ifdef KMP_TDATA_GTID
225  if (TCR_4(__kmp_gtid_mode) >= 3) {
226  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227  gtid = __kmp_gtid;
228  } else
229 #endif
230  if (TCR_4(__kmp_gtid_mode) >= 2) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232  gtid = __kmp_gtid_get_specific();
233  } else {
234  KA_TRACE(1000,
235  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236  gtid = __kmp_get_global_thread_id();
237  }
238 
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241  KA_TRACE(10,
242  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243  "Registering a new gtid.\n"));
244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245  if (!__kmp_init_serial) {
246  __kmp_do_serial_initialize();
247  gtid = __kmp_gtid_get_specific();
248  } else {
249  gtid = __kmp_register_root(FALSE);
250  }
251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254 
255  KMP_DEBUG_ASSERT(gtid >= 0);
256 
257  return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266 
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272  gtid = __kmp_gtid_from_thread(th);
273 
274  if (gtid == KMP_GTID_MONITOR) {
275  __kmp_print_storage_map_gtid(
276  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277  "th_%s stack (%s)", "mon",
278  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279  } else {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%d stack (%s)", gtid,
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  }
285  }
286 
287  /* No point in checking ubermaster threads since they use refinement and
288  * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291  KA_TRACE(10,
292  ("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if (stack_beg == NULL) {
294  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for (f = 0; f < __kmp_threads_capacity; f++) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if (f_th && f_th != th) {
302  char *other_stack_end =
303  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg =
305  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if (__kmp_storage_map)
311  __kmp_print_storage_map_gtid(
312  -1, other_stack_beg, other_stack_end,
313  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317  __kmp_msg_null);
318  }
319  }
320  }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
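/* Illustrative sketch (not compiled): the interval test used in the loop
   above, written as a standalone predicate. Names are hypothetical; the
   runtime compares raw char* stack bounds exactly as shown above. */
#if 0
// Two stacks overlap if either endpoint of one falls strictly inside the
// other; this mirrors the condition checked by __kmp_check_stack_overlap.
static bool kmp_example_stacks_overlap(const char *beg, const char *end,
                                       const char *other_beg,
                                       const char *other_end) {
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif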
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329 
330  while (!done) {
331  KMP_YIELD(1);
332  }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341 
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344  p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351  if (__kmp_storage_map_verbose) {
352  node = __kmp_get_host_node(p1);
353  if (node < 0) /* doesn't work, so don't try this next time */
354  __kmp_storage_map_verbose = FALSE;
355  else {
356  char *last;
357  int lastNode;
358  int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360  const int page_size = KMP_GET_PAGE_SIZE();
361 
362  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364  if (localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
366  localProc >> 1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370  /* The more elaborate format is disabled for now because of the prctl
371  * hanging bug. */
372  do {
373  last = p1;
374  lastNode = node;
375  /* This loop collates adjacent pages with the same host node. */
376  do {
377  (char *)p1 += page_size;
378  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
380  lastNode);
381  } while (p1 <= p2);
382 #else
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
384  (char *)p1 + (page_size - 1),
385  __kmp_get_host_node(p1));
386  if (p1 < p2) {
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
388  (char *)p2 + (page_size - 1),
389  __kmp_get_host_node(p2));
390  }
391 #endif
392  }
393  }
394  } else
395  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
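/* Illustrative sketch (not compiled): the page-rounding arithmetic used in
   the data-placement branch above, assuming page_size is a power of two.
   Names are hypothetical. */
#if 0
#include <cstddef>

// Round an address down to the start of its page by clearing the low bits.
static void *kmp_example_page_floor(void *p, std::size_t page_size) {
  return (void *)((std::size_t)p & ~(page_size - 1));
}
// The code above applies this to p1 directly and to ((char *)p2 - 1) so that
// a range ending exactly on a page boundary does not spill into the next page.
#endif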
400 
401 void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404 
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406  return;
407  }
408 
409  va_start(ap, format);
410 
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416  va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423  if (__kmp_debug_buf) {
424  __kmp_dump_debug_buffer();
425  }
426 
427  if (KMP_OS_WINDOWS) {
428  // Let other threads know of abnormal termination and prevent deadlock
429  // if abort happened during library initialization or shutdown
430  __kmp_global.g.g_abort = SIGABRT;
431 
432  /* On Windows* OS, by default abort() raises a pop-up error box, which
433  stalls nightly testing. Unfortunately, we cannot reliably suppress the
434  pop-up. _set_abort_behavior() works well, but it is not available in VS7
435  (not a problem for the DLL, but a problem for the static OpenMP RTL).
436  SetErrorMode (and hence the timelimit utility) does not help, at least
437  in some versions of the MS C RTL.
438 
439  The following sequence seems to be the only way to simulate abort() and
440  avoid the pop-up error box. */
441  raise(SIGABRT);
442  _exit(3); // Just in case, if signal ignored, exit anyway.
443  } else {
444  abort();
445  }
446 
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort just call abort(), it will kill all the threads.
455  __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459  that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463  gtid);
464 
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469  sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471  __kmp_print_storage_map_gtid(
472  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier + 1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478  gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481  &thr->th.th_bar[bs_forkjoin_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483  gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487  &thr->th.th_bar[bs_reduction_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489  gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494  that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497  int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id);
501 
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503  &team->t.t_bar[bs_last_barrier],
504  sizeof(kmp_balign_team_t) * bs_last_barrier,
505  "%s_%d.t_bar", header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508  &team->t.t_bar[bs_plain_barrier + 1],
509  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513  &team->t.t_bar[bs_forkjoin_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519  &team->t.t_bar[bs_reduction_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528  __kmp_print_storage_map_gtid(
529  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533  &team->t.t_disp_buffer[num_disp_buff],
534  sizeof(dispatch_shared_info_t) * num_disp_buff,
535  "%s_%d.t_disp_buffer", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539  team_id);
540 }
541 
542 static void __kmp_init_allocator() {
543 #if OMP_50_ENABLED
544  __kmp_init_memkind();
545 #endif
546 }
547 static void __kmp_fini_allocator() {
548 #if OMP_50_ENABLED
549  __kmp_fini_memkind();
550 #endif
551 }
552 
553 /* ------------------------------------------------------------------------ */
554 
555 #if KMP_DYNAMIC_LIB
556 #if KMP_OS_WINDOWS
557 
558 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
559  // TODO: Change to __kmp_break_bootstrap_lock().
560  __kmp_init_bootstrap_lock(lck); // make the lock released
561 }
562 
563 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
564  int i;
565  int thread_count;
566 
567  // PROCESS_DETACH is expected to be called by a thread that executes
568  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
569  // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
570  // __kmp_threads[] without taking the forkjoin_lock. However, some threads can
571  // still be alive here, although they are about to be terminated. The entries
572  // with ds_thread==0 are the most suspicious, so accessing __kmp_threads[] may
573  // in fact not be safe.
574 
575  // TODO: does it make sense to check __kmp_roots[] ?
576 
577  // Let's check that there are no other alive threads registered with the OMP
578  // lib.
579  while (1) {
580  thread_count = 0;
581  for (i = 0; i < __kmp_threads_capacity; ++i) {
582  if (!__kmp_threads)
583  continue;
584  kmp_info_t *th = __kmp_threads[i];
585  if (th == NULL)
586  continue;
587  int gtid = th->th.th_info.ds.ds_gtid;
588  if (gtid == gtid_req)
589  continue;
590  if (gtid < 0)
591  continue;
592  DWORD exit_val;
593  int alive = __kmp_is_thread_alive(th, &exit_val);
594  if (alive) {
595  ++thread_count;
596  }
597  }
598  if (thread_count == 0)
599  break; // success
600  }
601 
602  // Assume that I'm alone. Now it might be safe to check and reset locks.
603  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
604  __kmp_reset_lock(&__kmp_forkjoin_lock);
605 #ifdef KMP_DEBUG
606  __kmp_reset_lock(&__kmp_stdio_lock);
607 #endif // KMP_DEBUG
608 }
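/* Illustrative sketch (not compiled): the structure of the polling loop above.
   `is_alive` stands in for __kmp_is_thread_alive; all names are hypothetical
   and only show the "wait until we are the last live registered thread" idea. */
#if 0
template <typename ThreadPtr, typename AliveFn>
static void kmp_example_wait_until_alone(ThreadPtr *threads, int capacity,
                                         int self_gtid, AliveFn is_alive) {
  for (;;) {
    int live = 0;
    for (int i = 0; i < capacity; ++i) {
      ThreadPtr th = threads[i];
      if (th != nullptr && i != self_gtid && is_alive(th))
        ++live; // another registered thread is still running
    }
    if (live == 0)
      break; // now it should be safe to reset the bootstrap locks
  }
}
#endif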
609 
610 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
611  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
612 
613  switch (fdwReason) {
614 
615  case DLL_PROCESS_ATTACH:
616  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
617 
618  return TRUE;
619 
620  case DLL_PROCESS_DETACH:
621  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
622 
623  if (lpReserved != NULL) {
624  // lpReserved is used for telling the difference:
625  // lpReserved == NULL when FreeLibrary() was called,
626  // lpReserved != NULL when the process terminates.
627  // When FreeLibrary() is called, worker threads remain alive. So they will
628  // release the forkjoin lock by themselves. When the process terminates,
629  // worker threads disappear triggering the problem of unreleased forkjoin
630  // lock as described below.
631 
632  // A worker thread can take the forkjoin lock. The problem comes up if
633  // that worker thread becomes dead before it releases the forkjoin lock.
634  // The forkjoin lock remains taken, while the thread executing
635  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
636  // to take the forkjoin lock and will always fail, so that the application
637  // will never finish [normally]. This scenario is possible if
638  // __kmpc_end() has not been executed. This is not just a corner case; it
639  // arises in common situations:
640  // - the main function was compiled by an alternative compiler;
641  // - the main function was compiled by icl but without /Qopenmp
642  // (application with plugins);
643  // - application terminates by calling C exit(), Fortran CALL EXIT() or
644  // Fortran STOP.
645  // - alive foreign thread prevented __kmpc_end from doing cleanup.
646  //
647  // This is a hack to work around the problem.
648  // TODO: !!! figure out something better.
649  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
650  }
651 
652  __kmp_internal_end_library(__kmp_gtid_get_specific());
653 
654  return TRUE;
655 
656  case DLL_THREAD_ATTACH:
657  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
658 
659  /* if we want to register new siblings all the time here call
660  * __kmp_get_gtid(); */
661  return TRUE;
662 
663  case DLL_THREAD_DETACH:
664  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
665 
666  __kmp_internal_end_thread(__kmp_gtid_get_specific());
667  return TRUE;
668  }
669 
670  return TRUE;
671 }
672 
673 #endif /* KMP_OS_WINDOWS */
674 #endif /* KMP_DYNAMIC_LIB */
675 
676 /* Change the library type to "status" and return the old type */
677 /* called from within initialization routines where __kmp_initz_lock is held */
678 int __kmp_change_library(int status) {
679  int old_status;
680 
681  old_status = __kmp_yield_init &
682  1; // check whether KMP_LIBRARY=throughput (even init count)
683 
684  if (status) {
685  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
686  } else {
687  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
688  }
689 
690  return old_status; // return previous setting of whether
691  // KMP_LIBRARY=throughput
692 }
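/* Illustrative sketch (not compiled): the low-bit encoding used above. An even
   __kmp_yield_init count selects KMP_LIBRARY=throughput and an odd count
   selects turnaround; __kmp_change_library only flips that parity bit. Names
   are hypothetical. */
#if 0
static int kmp_example_change_library(int *yield_init, int want_turnaround) {
  int old_parity = *yield_init & 1; // 1 = odd count = turnaround
  if (want_turnaround)
    *yield_init |= 1;  // force the count odd  (turnaround)
  else
    *yield_init &= ~1; // force the count even (throughput)
  return old_parity;   // previous setting, as returned by the real function
}
#endif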
693 
694 /* __kmp_parallel_deo -- Wait until it's our turn. */
695 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698  kmp_team_t *team = __kmp_team_from_gtid(gtid);
699 #endif /* BUILD_PARALLEL_ORDERED */
700 
701  if (__kmp_env_consistency_check) {
702  if (__kmp_threads[gtid]->th.th_root->r.r_active)
703 #if KMP_USE_DYNAMIC_LOCK
704  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
705 #else
706  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
707 #endif
708  }
709 #ifdef BUILD_PARALLEL_ORDERED
710  if (!team->t.t_serialized) {
711  KMP_MB();
712  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
713  KMP_EQ, NULL);
714  KMP_MB();
715  }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* __kmp_parallel_dxo -- Signal the next task. */
720 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
721  int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723  int tid = __kmp_tid_from_gtid(gtid);
724  kmp_team_t *team = __kmp_team_from_gtid(gtid);
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727  if (__kmp_env_consistency_check) {
728  if (__kmp_threads[gtid]->th.th_root->r.r_active)
729  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
730  }
731 #ifdef BUILD_PARALLEL_ORDERED
732  if (!team->t.t_serialized) {
733  KMP_MB(); /* Flush all pending memory write invalidates. */
734 
735  /* use the tid of the next thread in this team */
736  /* TODO replace with general release procedure */
737  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
738 
739  KMP_MB(); /* Flush all pending memory write invalidates. */
740  }
741 #endif /* BUILD_PARALLEL_ORDERED */
742 }
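/* Illustrative sketch (not compiled): the turn-passing protocol implemented by
   __kmp_parallel_deo/__kmp_parallel_dxo above, using std::atomic in place of
   the KMP_WAIT_YIELD/KMP_MB machinery. Names are hypothetical; the real code
   also yields while spinning. */
#if 0
#include <atomic>

struct kmp_example_ordered {
  std::atomic<int> turn{0}; // tid whose iteration may enter the ordered region
};

static void kmp_example_deo(kmp_example_ordered &o, int tid) {
  while (o.turn.load(std::memory_order_acquire) != tid) {
    // spin until it is our turn (the runtime yields here)
  }
}

static void kmp_example_dxo(kmp_example_ordered &o, int tid, int nproc) {
  // Hand the token to the next thread in the team, wrapping around.
  o.turn.store((tid + 1) % nproc, std::memory_order_release);
}
#endif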
743 
744 /* ------------------------------------------------------------------------ */
745 /* The BARRIER for a SINGLE process section is always explicit */
746 
747 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
748  int status;
749  kmp_info_t *th;
750  kmp_team_t *team;
751 
752  if (!TCR_4(__kmp_init_parallel))
753  __kmp_parallel_initialize();
754 
755  th = __kmp_threads[gtid];
756  team = th->th.th_team;
757  status = 0;
758 
759  th->th.th_ident = id_ref;
760 
761  if (team->t.t_serialized) {
762  status = 1;
763  } else {
764  kmp_int32 old_this = th->th.th_local.this_construct;
765 
766  ++th->th.th_local.this_construct;
767  /* try to set team count to thread count--success means thread got the
768  single block */
769  /* TODO: Should this be acquire or release? */
770  if (team->t.t_construct == old_this) {
771  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
772  th->th.th_local.this_construct);
773  }
774 #if USE_ITT_BUILD
775  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
776  KMP_MASTER_GTID(gtid) &&
777 #if OMP_40_ENABLED
778  th->th.th_teams_microtask == NULL &&
779 #endif
780  team->t.t_active_level ==
781  1) { // Only report metadata by master of active team at level 1
782  __kmp_itt_metadata_single(id_ref);
783  }
784 #endif /* USE_ITT_BUILD */
785  }
786 
787  if (__kmp_env_consistency_check) {
788  if (status && push_ws) {
789  __kmp_push_workshare(gtid, ct_psingle, id_ref);
790  } else {
791  __kmp_check_workshare(gtid, ct_psingle, id_ref);
792  }
793  }
794 #if USE_ITT_BUILD
795  if (status) {
796  __kmp_itt_single_start(gtid);
797  }
798 #endif /* USE_ITT_BUILD */
799  return status;
800 }
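/* Illustrative sketch (not compiled): how one thread wins the SINGLE block in
   __kmp_enter_single above. Each thread advances a private construct counter;
   the first one to CAS the shared team counter forward gets status == 1.
   std::atomic stands in for __kmp_atomic_compare_store_acq; names are
   hypothetical. */
#if 0
#include <atomic>

static bool kmp_example_try_claim_single(std::atomic<int> &team_construct,
                                         int &my_construct) {
  int old_this = my_construct++; // value both counters held before this SINGLE
  int expected = old_this;
  // Only the first thread to arrive can move the team counter forward.
  return team_construct.compare_exchange_strong(expected, my_construct,
                                                std::memory_order_acquire);
}
#endif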
801 
802 void __kmp_exit_single(int gtid) {
803 #if USE_ITT_BUILD
804  __kmp_itt_single_end(gtid);
805 #endif /* USE_ITT_BUILD */
806  if (__kmp_env_consistency_check)
807  __kmp_pop_workshare(gtid, ct_psingle, NULL);
808 }
809 
810 /* determine if we can go parallel or must use a serialized parallel region and
811  * how many threads we can use
812  * set_nthreads is the number of threads requested for the team
813  * returns 1 if we should serialize or only use one thread,
814  * otherwise the number of threads to use
815  * The forkjoin lock is held by the caller. */
816 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
817  int master_tid, int set_nthreads
818 #if OMP_40_ENABLED
819  ,
820  int enter_teams
821 #endif /* OMP_40_ENABLED */
822  ) {
823  int capacity;
824  int new_nthreads;
825  KMP_DEBUG_ASSERT(__kmp_init_serial);
826  KMP_DEBUG_ASSERT(root && parent_team);
827 
828  // If dyn-var is set, dynamically adjust the number of desired threads,
829  // according to the method specified by dynamic_mode.
830  new_nthreads = set_nthreads;
831  if (!get__dynamic_2(parent_team, master_tid)) {
832  ;
833  }
834 #ifdef USE_LOAD_BALANCE
835  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
836  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
837  if (new_nthreads == 1) {
838  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
839  "reservation to 1 thread\n",
840  master_tid));
841  return 1;
842  }
843  if (new_nthreads < set_nthreads) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
845  "reservation to %d threads\n",
846  master_tid, new_nthreads));
847  }
848  }
849 #endif /* USE_LOAD_BALANCE */
850  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
851  new_nthreads = __kmp_avail_proc - __kmp_nth +
852  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853  if (new_nthreads <= 1) {
854  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
855  "reservation to 1 thread\n",
856  master_tid));
857  return 1;
858  }
859  if (new_nthreads < set_nthreads) {
860  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
861  "reservation to %d threads\n",
862  master_tid, new_nthreads));
863  } else {
864  new_nthreads = set_nthreads;
865  }
866  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
867  if (set_nthreads > 2) {
868  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
869  new_nthreads = (new_nthreads % set_nthreads) + 1;
870  if (new_nthreads == 1) {
871  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
872  "reservation to 1 thread\n",
873  master_tid));
874  return 1;
875  }
876  if (new_nthreads < set_nthreads) {
877  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
878  "reservation to %d threads\n",
879  master_tid, new_nthreads));
880  }
881  }
882  } else {
883  KMP_ASSERT(0);
884  }
885 
886  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
887  if (__kmp_nth + new_nthreads -
888  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
889  __kmp_max_nth) {
890  int tl_nthreads = __kmp_max_nth - __kmp_nth +
891  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
892  if (tl_nthreads <= 0) {
893  tl_nthreads = 1;
894  }
895 
896  // If dyn-var is false, emit a 1-time warning.
897  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
898  __kmp_reserve_warn = 1;
899  __kmp_msg(kmp_ms_warning,
900  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
901  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
902  }
903  if (tl_nthreads == 1) {
904  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
905  "reduced reservation to 1 thread\n",
906  master_tid));
907  return 1;
908  }
909  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
910  "reservation to %d threads\n",
911  master_tid, tl_nthreads));
912  new_nthreads = tl_nthreads;
913  }
914 
915  // Respect OMP_THREAD_LIMIT
916  if (root->r.r_cg_nthreads + new_nthreads -
917  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
918  __kmp_cg_max_nth) {
919  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
920  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
921  if (tl_nthreads <= 0) {
922  tl_nthreads = 1;
923  }
924 
925  // If dyn-var is false, emit a 1-time warning.
926  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
927  __kmp_reserve_warn = 1;
928  __kmp_msg(kmp_ms_warning,
929  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
930  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
931  }
932  if (tl_nthreads == 1) {
933  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
934  "reduced reservation to 1 thread\n",
935  master_tid));
936  return 1;
937  }
938  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
939  "reservation to %d threads\n",
940  master_tid, tl_nthreads));
941  new_nthreads = tl_nthreads;
942  }
943 
944  // Check if the threads array is large enough, or needs expanding.
945  // See comment in __kmp_register_root() about the adjustment if
946  // __kmp_threads[0] == NULL.
947  capacity = __kmp_threads_capacity;
948  if (TCR_PTR(__kmp_threads[0]) == NULL) {
949  --capacity;
950  }
951  if (__kmp_nth + new_nthreads -
952  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
953  capacity) {
954  // Expand the threads array.
955  int slotsRequired = __kmp_nth + new_nthreads -
956  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
957  capacity;
958  int slotsAdded = __kmp_expand_threads(slotsRequired);
959  if (slotsAdded < slotsRequired) {
960  // The threads array was not expanded enough.
961  new_nthreads -= (slotsRequired - slotsAdded);
962  KMP_ASSERT(new_nthreads >= 1);
963 
964  // If dyn-var is false, emit a 1-time warning.
965  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
966  __kmp_reserve_warn = 1;
967  if (__kmp_tp_cached) {
968  __kmp_msg(kmp_ms_warning,
969  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
970  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
971  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
972  } else {
973  __kmp_msg(kmp_ms_warning,
974  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
975  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
976  }
977  }
978  }
979  }
980 
981 #ifdef KMP_DEBUG
982  if (new_nthreads == 1) {
983  KC_TRACE(10,
984  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
985  "dead roots and rechecking; requested %d threads\n",
986  __kmp_get_gtid(), set_nthreads));
987  } else {
988  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
989  " %d threads\n",
990  __kmp_get_gtid(), new_nthreads, set_nthreads));
991  }
992 #endif // KMP_DEBUG
993  return new_nthreads;
994 }
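/* Illustrative sketch (not compiled): the clamping arithmetic applied above
   for KMP_DEVICE_THREAD_LIMIT / OMP_THREAD_LIMIT. `already_counted` stands for
   the threads of this root that are already included in the global count (the
   master when the root is active, otherwise the whole hot team). Names are
   hypothetical. */
#if 0
static int kmp_example_clamp_team_size(int requested, int global_nth,
                                       int max_nth, int already_counted) {
  if (global_nth + requested - already_counted <= max_nth)
    return requested; // the request fits under the limit
  int clamped = max_nth - global_nth + already_counted;
  return clamped > 0 ? clamped : 1; // never reserve fewer than one thread
}
#endif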
995 
996 /* Allocate threads from the thread pool and assign them to the new team. We are
997  assured that there are enough threads available, because we checked on that
998  earlier within the forkjoin critical section. */
999 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1000  kmp_info_t *master_th, int master_gtid) {
1001  int i;
1002  int use_hot_team;
1003 
1004  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1005  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1006  KMP_MB();
1007 
1008  /* first, let's setup the master thread */
1009  master_th->th.th_info.ds.ds_tid = 0;
1010  master_th->th.th_team = team;
1011  master_th->th.th_team_nproc = team->t.t_nproc;
1012  master_th->th.th_team_master = master_th;
1013  master_th->th.th_team_serialized = FALSE;
1014  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1015 
1016 /* make sure we are not the optimized hot team */
1017 #if KMP_NESTED_HOT_TEAMS
1018  use_hot_team = 0;
1019  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1020  if (hot_teams) { // hot teams array is not allocated if
1021  // KMP_HOT_TEAMS_MAX_LEVEL=0
1022  int level = team->t.t_active_level - 1; // index in array of hot teams
1023  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1024  if (master_th->th.th_teams_size.nteams > 1) {
1025  ++level; // level was not increased in teams construct for
1026  // team_of_masters
1027  }
1028  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1029  master_th->th.th_teams_level == team->t.t_level) {
1030  ++level; // level was not increased in teams construct for
1031  // team_of_workers before the parallel
1032  } // team->t.t_level will be increased inside parallel
1033  }
1034  if (level < __kmp_hot_teams_max_level) {
1035  if (hot_teams[level].hot_team) {
1036  // hot team has already been allocated for given level
1037  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1038  use_hot_team = 1; // the team is ready to use
1039  } else {
1040  use_hot_team = 0; // AC: threads are not allocated yet
1041  hot_teams[level].hot_team = team; // remember new hot team
1042  hot_teams[level].hot_team_nth = team->t.t_nproc;
1043  }
1044  } else {
1045  use_hot_team = 0;
1046  }
1047  }
1048 #else
1049  use_hot_team = team == root->r.r_hot_team;
1050 #endif
1051  if (!use_hot_team) {
1052 
1053  /* install the master thread */
1054  team->t.t_threads[0] = master_th;
1055  __kmp_initialize_info(master_th, team, 0, master_gtid);
1056 
1057  /* now, install the worker threads */
1058  for (i = 1; i < team->t.t_nproc; i++) {
1059 
1060  /* fork or reallocate a new thread and install it in team */
1061  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1062  team->t.t_threads[i] = thr;
1063  KMP_DEBUG_ASSERT(thr);
1064  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1065  /* align team and thread arrived states */
1066  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1067  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1068  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1069  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1070  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1071  team->t.t_bar[bs_plain_barrier].b_arrived));
1072 #if OMP_40_ENABLED
1073  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1074  thr->th.th_teams_level = master_th->th.th_teams_level;
1075  thr->th.th_teams_size = master_th->th.th_teams_size;
1076 #endif
1077  { // Initialize threads' barrier data.
1078  int b;
1079  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1080  for (b = 0; b < bs_last_barrier; ++b) {
1081  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1082  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1083 #if USE_DEBUGGER
1084  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1085 #endif
1086  }
1087  }
1088  }
1089 
1090 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1091  __kmp_partition_places(team);
1092 #endif
1093  }
1094 
1095  KMP_MB();
1096 }
1097 
1098 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1099 // Propagate any changes to the floating point control registers out to the team
1100 // We try to avoid unnecessary writes to the relevant cache line in the team
1101 // structure, so we don't make changes unless they are needed.
1102 inline static void propagateFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control) {
1104  kmp_int16 x87_fpu_control_word;
1105  kmp_uint32 mxcsr;
1106 
1107  // Get master values of FPU control flags (both X87 and vector)
1108  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109  __kmp_store_mxcsr(&mxcsr);
1110  mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112  // There is no point looking at t_fp_control_saved here.
1113  // If it is TRUE, we still have to update the values if they are different
1114  // from those we now have. If it is FALSE we didn't save anything yet, but
1115  // our objective is the same. We have to ensure that the values in the team
1116  // are the same as those we have.
1117  // So, this code achieves what we need whether or not t_fp_control_saved is
1118  // true. By checking whether the value needs updating we avoid unnecessary
1119  // writes that would put the cache-line into a written state, causing all
1120  // threads in the team to have to read it again.
1121  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1122  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1123  // Although we don't use this value, other code in the runtime wants to know
1124  // whether it should restore them. So we must ensure it is correct.
1125  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1126  } else {
1127  // Similarly here. Don't write to this cache-line in the team structure
1128  // unless we have to.
1129  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1130  }
1131 }
1132 
1133 // Do the opposite, setting the hardware registers to the updated values from
1134 // the team.
1135 inline static void updateHWFPControl(kmp_team_t *team) {
1136  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1137  // Only reset the fp control regs if they have been changed in the team
1138  // during the parallel region that we are exiting.
1139  kmp_int16 x87_fpu_control_word;
1140  kmp_uint32 mxcsr;
1141  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1142  __kmp_store_mxcsr(&mxcsr);
1143  mxcsr &= KMP_X86_MXCSR_MASK;
1144 
1145  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1146  __kmp_clear_x87_fpu_status_word();
1147  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1148  }
1149 
1150  if (team->t.t_mxcsr != mxcsr) {
1151  __kmp_load_mxcsr(&team->t.t_mxcsr);
1152  }
1153  }
1154 }
1155 #else
1156 #define propagateFPControl(x) ((void)0)
1157 #define updateHWFPControl(x) ((void)0)
1158 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
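/* Illustrative sketch (not compiled): the "write only if different" pattern
   that KMP_CHECK_UPDATE gives propagateFPControl/updateHWFPControl above. The
   goal is to avoid dirtying the shared team cache line when the FP state has
   not actually changed. Names are hypothetical. */
#if 0
template <typename T>
static void kmp_example_check_update(T &dst, const T &src) {
  if (dst != src) // skip the store (and the cache-line invalidation) if equal
    dst = src;
}
// Usage, mirroring propagateFPControl:
//   kmp_example_check_update(team_x87_cw, current_x87_cw);
//   kmp_example_check_update(team_mxcsr, current_mxcsr & KMP_X86_MXCSR_MASK);
#endif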
1159 
1160 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1161  int realloc); // forward declaration
1162 
1163 /* Run a parallel region that has been serialized, so runs only in a team of the
1164  single master thread. */
1165 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1166  kmp_info_t *this_thr;
1167  kmp_team_t *serial_team;
1168 
1169  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1170 
1171  /* Skip all this code for autopar serialized loops since it results in
1172  unacceptable overhead */
1173  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1174  return;
1175 
1176  if (!TCR_4(__kmp_init_parallel))
1177  __kmp_parallel_initialize();
1178 
1179  this_thr = __kmp_threads[global_tid];
1180  serial_team = this_thr->th.th_serial_team;
1181 
1182  /* utilize the serialized team held by this thread */
1183  KMP_DEBUG_ASSERT(serial_team);
1184  KMP_MB();
1185 
1186  if (__kmp_tasking_mode != tskm_immediate_exec) {
1187  KMP_DEBUG_ASSERT(
1188  this_thr->th.th_task_team ==
1189  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1190  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1191  NULL);
1192  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1193  "team %p, new task_team = NULL\n",
1194  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1195  this_thr->th.th_task_team = NULL;
1196  }
1197 
1198 #if OMP_40_ENABLED
1199  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1200  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1201  proc_bind = proc_bind_false;
1202  } else if (proc_bind == proc_bind_default) {
1203  // No proc_bind clause was specified, so use the current value
1204  // of proc-bind-var for this parallel region.
1205  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1206  }
1207  // Reset for next parallel region
1208  this_thr->th.th_set_proc_bind = proc_bind_default;
1209 #endif /* OMP_40_ENABLED */
1210 
1211 #if OMPT_SUPPORT
1212  ompt_data_t ompt_parallel_data = ompt_data_none;
1213  ompt_data_t *implicit_task_data;
1214  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1215  if (ompt_enabled.enabled &&
1216  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1217 
1218  ompt_task_info_t *parent_task_info;
1219  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1220 
1221  parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1222  if (ompt_enabled.ompt_callback_parallel_begin) {
1223  int team_size = 1;
1224 
1225  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1226  &(parent_task_info->task_data), &(parent_task_info->frame),
1227  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1228  codeptr);
1229  }
1230  }
1231 #endif // OMPT_SUPPORT
1232 
1233  if (this_thr->th.th_team != serial_team) {
1234  // Nested level will be an index in the nested nthreads array
1235  int level = this_thr->th.th_team->t.t_level;
1236 
1237  if (serial_team->t.t_serialized) {
1238  /* this serial team was already used
1239  TODO increase performance by making these locks more specific */
1240  kmp_team_t *new_team;
1241 
1242  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1243 
1244  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1245 #if OMPT_SUPPORT
1246  ompt_parallel_data,
1247 #endif
1248 #if OMP_40_ENABLED
1249  proc_bind,
1250 #endif
1251  &this_thr->th.th_current_task->td_icvs,
1252  0 USE_NESTED_HOT_ARG(NULL));
1253  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1254  KMP_ASSERT(new_team);
1255 
1256  /* setup new serialized team and install it */
1257  new_team->t.t_threads[0] = this_thr;
1258  new_team->t.t_parent = this_thr->th.th_team;
1259  serial_team = new_team;
1260  this_thr->th.th_serial_team = serial_team;
1261 
1262  KF_TRACE(
1263  10,
1264  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1265  global_tid, serial_team));
1266 
1267  /* TODO the above breaks the requirement that if we run out of resources,
1268  then we can still guarantee that serialized teams are ok, since we may
1269  need to allocate a new one */
1270  } else {
1271  KF_TRACE(
1272  10,
1273  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1274  global_tid, serial_team));
1275  }
1276 
1277  /* we have to initialize this serial team */
1278  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1279  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1280  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1281  serial_team->t.t_ident = loc;
1282  serial_team->t.t_serialized = 1;
1283  serial_team->t.t_nproc = 1;
1284  serial_team->t.t_parent = this_thr->th.th_team;
1285  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1286  this_thr->th.th_team = serial_team;
1287  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1288 
1289  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1290  this_thr->th.th_current_task));
1291  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1292  this_thr->th.th_current_task->td_flags.executing = 0;
1293 
1294  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1295 
1296  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1297  implicit task for each serialized task represented by
1298  team->t.t_serialized? */
1299  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1300  &this_thr->th.th_current_task->td_parent->td_icvs);
1301 
1302  // Thread value exists in the nested nthreads array for the next nested
1303  // level
1304  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1305  this_thr->th.th_current_task->td_icvs.nproc =
1306  __kmp_nested_nth.nth[level + 1];
1307  }
1308 
1309 #if OMP_40_ENABLED
1310  if (__kmp_nested_proc_bind.used &&
1311  (level + 1 < __kmp_nested_proc_bind.used)) {
1312  this_thr->th.th_current_task->td_icvs.proc_bind =
1313  __kmp_nested_proc_bind.bind_types[level + 1];
1314  }
1315 #endif /* OMP_40_ENABLED */
1316 
1317 #if USE_DEBUGGER
1318  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1319 #endif
1320  this_thr->th.th_info.ds.ds_tid = 0;
1321 
1322  /* set thread cache values */
1323  this_thr->th.th_team_nproc = 1;
1324  this_thr->th.th_team_master = this_thr;
1325  this_thr->th.th_team_serialized = 1;
1326 
1327  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1328  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1329 #if OMP_50_ENABLED
1330  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1331 #endif
1332 
1333  propagateFPControl(serial_team);
1334 
1335  /* check if we need to allocate dispatch buffers stack */
1336  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1337  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1338  serial_team->t.t_dispatch->th_disp_buffer =
1339  (dispatch_private_info_t *)__kmp_allocate(
1340  sizeof(dispatch_private_info_t));
1341  }
1342  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1343 
1344  KMP_MB();
1345 
1346  } else {
1347  /* this serialized team is already being used,
1348  * that's fine, just add another nested level */
1349  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1350  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1351  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1352  ++serial_team->t.t_serialized;
1353  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1354 
1355  // Nested level will be an index in the nested nthreads array
1356  int level = this_thr->th.th_team->t.t_level;
1357  // Thread value exists in the nested nthreads array for the next nested
1358  // level
1359  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1360  this_thr->th.th_current_task->td_icvs.nproc =
1361  __kmp_nested_nth.nth[level + 1];
1362  }
1363  serial_team->t.t_level++;
1364  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1365  "of serial team %p to %d\n",
1366  global_tid, serial_team, serial_team->t.t_level));
1367 
1368  /* allocate/push dispatch buffers stack */
1369  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1370  {
1371  dispatch_private_info_t *disp_buffer =
1372  (dispatch_private_info_t *)__kmp_allocate(
1373  sizeof(dispatch_private_info_t));
1374  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1375  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1376  }
1377  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1378 
1379  KMP_MB();
1380  }
1381 #if OMP_40_ENABLED
1382  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1383 #endif
1384 
1385  if (__kmp_env_consistency_check)
1386  __kmp_push_parallel(global_tid, NULL);
1387 #if OMPT_SUPPORT
1388  serial_team->t.ompt_team_info.master_return_address = codeptr;
1389  if (ompt_enabled.enabled &&
1390  this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1391  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1392 
1393  ompt_lw_taskteam_t lw_taskteam;
1394  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1395  &ompt_parallel_data, codeptr);
1396 
1397  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1398  // don't use lw_taskteam after linking. Content was swapped.
1399 
1400  /* OMPT implicit task begin */
1401  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1402  if (ompt_enabled.ompt_callback_implicit_task) {
1403  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1404  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1405  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1406  OMPT_CUR_TASK_INFO(this_thr)
1407  ->thread_num = __kmp_tid_from_gtid(global_tid);
1408  }
1409 
1410  /* OMPT state */
1411  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1412  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1413  }
1414 #endif
1415 }
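/* Illustrative sketch (not compiled): the per-nesting-level dispatch-buffer
   stack maintained by __kmp_serialized_parallel above. Each additional
   serialized nesting level pushes one buffer onto a singly linked list headed
   by th_disp_buffer; the matching join pops it. Names are hypothetical. */
#if 0
struct kmp_example_disp_buffer {
  kmp_example_disp_buffer *next; // buffer of the enclosing nesting level
  // per-level loop-dispatch state would live here
};

static void kmp_example_push_disp_buffer(kmp_example_disp_buffer *&head,
                                         kmp_example_disp_buffer *buf) {
  buf->next = head; // the new level's buffer sits on top of the old one
  head = buf;
}
#endif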
1416 
1417 /* most of the work for a fork */
1418 /* return true if we really went parallel, false if serialized */
1419 int __kmp_fork_call(ident_t *loc, int gtid,
1420  enum fork_context_e call_context, // Intel, GNU, ...
1421  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1422 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1423 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1424  va_list *ap
1425 #else
1426  va_list ap
1427 #endif
1428  ) {
1429  void **argv;
1430  int i;
1431  int master_tid;
1432  int master_this_cons;
1433  kmp_team_t *team;
1434  kmp_team_t *parent_team;
1435  kmp_info_t *master_th;
1436  kmp_root_t *root;
1437  int nthreads;
1438  int master_active;
1439  int master_set_numthreads;
1440  int level;
1441 #if OMP_40_ENABLED
1442  int active_level;
1443  int teams_level;
1444 #endif
1445 #if KMP_NESTED_HOT_TEAMS
1446  kmp_hot_team_ptr_t **p_hot_teams;
1447 #endif
1448  { // KMP_TIME_BLOCK
1449  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1450  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1451 
1452  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1453  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1454  /* Some systems prefer the stack for the root thread(s) to start with */
1455  /* some gap from the parent stack to prevent false sharing. */
1456  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1457  /* These 2 lines below are so this does not get optimized out */
1458  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1459  __kmp_stkpadding += (short)((kmp_int64)dummy);
1460  }
1461 
1462  /* initialize if needed */
1463  KMP_DEBUG_ASSERT(
1464  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1465  if (!TCR_4(__kmp_init_parallel))
1466  __kmp_parallel_initialize();
1467 
1468  /* setup current data */
1469  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1470  // shutdown
1471  parent_team = master_th->th.th_team;
1472  master_tid = master_th->th.th_info.ds.ds_tid;
1473  master_this_cons = master_th->th.th_local.this_construct;
1474  root = master_th->th.th_root;
1475  master_active = root->r.r_active;
1476  master_set_numthreads = master_th->th.th_set_nproc;
1477 
1478 #if OMPT_SUPPORT
1479  ompt_data_t ompt_parallel_data = ompt_data_none;
1480  ompt_data_t *parent_task_data;
1481  omp_frame_t *ompt_frame;
1482  ompt_data_t *implicit_task_data;
1483  void *return_address = NULL;
1484 
1485  if (ompt_enabled.enabled) {
1486  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1487  NULL, NULL);
1488  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1489  }
1490 #endif
1491 
1492  // Nested level will be an index in the nested nthreads array
1493  level = parent_team->t.t_level;
1494  // used to launch non-serial teams even if nested is not allowed
1495  active_level = parent_team->t.t_active_level;
1496 #if OMP_40_ENABLED
1497  // needed to check nesting inside the teams
1498  teams_level = master_th->th.th_teams_level;
1499 #endif
1500 #if KMP_NESTED_HOT_TEAMS
1501  p_hot_teams = &master_th->th.th_hot_teams;
1502  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1503  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1504  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1505  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1506  // it is either actual or not needed (when active_level > 0)
1507  (*p_hot_teams)[0].hot_team_nth = 1;
1508  }
1509 #endif
1510 
1511 #if OMPT_SUPPORT
1512  if (ompt_enabled.enabled) {
1513  if (ompt_enabled.ompt_callback_parallel_begin) {
1514  int team_size = master_set_numthreads
1515  ? master_set_numthreads
1516  : get__nproc_2(parent_team, master_tid);
1517  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1518  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1519  OMPT_INVOKER(call_context), return_address);
1520  }
1521  master_th->th.ompt_thread_info.state = omp_state_overhead;
1522  }
1523 #endif
1524 
1525  master_th->th.th_ident = loc;
1526 
1527 #if OMP_40_ENABLED
1528  if (master_th->th.th_teams_microtask && ap &&
1529  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1530  // AC: This is the start of a parallel that is nested inside a teams construct.
1531  // The team is actual (hot), all workers are ready at the fork barrier.
1532  // No lock needed to initialize the team a bit, then free workers.
1533  parent_team->t.t_ident = loc;
1534  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1535  parent_team->t.t_argc = argc;
1536  argv = (void **)parent_team->t.t_argv;
1537  for (i = argc - 1; i >= 0; --i)
1538 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1539 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1540  *argv++ = va_arg(*ap, void *);
1541 #else
1542  *argv++ = va_arg(ap, void *);
1543 #endif
1544  // Increment our nested depth levels, but do not increase the serialization
1545  if (parent_team == master_th->th.th_serial_team) {
1546  // AC: we are in serialized parallel
1547  __kmpc_serialized_parallel(loc, gtid);
1548  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1549  // AC: need this so that the enquiry functions work
1550  // correctly; will restore at join time
1551  parent_team->t.t_serialized--;
1552 #if OMPT_SUPPORT
1553  void *dummy;
1554  void **exit_runtime_p;
1555 
1556  ompt_lw_taskteam_t lw_taskteam;
1557 
1558  if (ompt_enabled.enabled) {
1559  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1560  &ompt_parallel_data, return_address);
1561  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1562 
1563  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1564  // don't use lw_taskteam after linking. Content was swapped.
1565 
1566  /* OMPT implicit task begin */
1567  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1568  if (ompt_enabled.ompt_callback_implicit_task) {
1569  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1570  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1571  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1572  OMPT_CUR_TASK_INFO(master_th)
1573  ->thread_num = __kmp_tid_from_gtid(gtid);
1574  }
1575 
1576  /* OMPT state */
1577  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1578  } else {
1579  exit_runtime_p = &dummy;
1580  }
1581 #endif
1582 
1583  {
1584  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1585  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1586  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1587 #if OMPT_SUPPORT
1588  ,
1589  exit_runtime_p
1590 #endif
1591  );
1592  }
1593 
1594 #if OMPT_SUPPORT
1595  *exit_runtime_p = NULL;
1596  if (ompt_enabled.enabled) {
1597  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1598  if (ompt_enabled.ompt_callback_implicit_task) {
1599  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1600  ompt_scope_end, NULL, implicit_task_data, 1,
1601  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1602  }
1603  __ompt_lw_taskteam_unlink(master_th);
1604 
1605  if (ompt_enabled.ompt_callback_parallel_end) {
1606  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1607  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1608  OMPT_INVOKER(call_context), return_address);
1609  }
1610  master_th->th.ompt_thread_info.state = omp_state_overhead;
1611  }
1612 #endif
1613  return TRUE;
1614  }
1615 
1616  parent_team->t.t_pkfn = microtask;
1617  parent_team->t.t_invoke = invoker;
1618  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1619  parent_team->t.t_active_level++;
1620  parent_team->t.t_level++;
1621 #if OMP_50_ENABLED
1622  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1623 #endif
1624 
1625  /* Change number of threads in the team if requested */
1626  if (master_set_numthreads) { // The parallel has num_threads clause
1627  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1628  // AC: can only reduce the number of threads dynamically, can't increase
1629  kmp_info_t **other_threads = parent_team->t.t_threads;
1630  parent_team->t.t_nproc = master_set_numthreads;
1631  for (i = 0; i < master_set_numthreads; ++i) {
1632  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1633  }
1634  // Keep extra threads hot in the team for possible next parallels
1635  }
1636  master_th->th.th_set_nproc = 0;
1637  }
1638 
1639 #if USE_DEBUGGER
1640  if (__kmp_debugging) { // Let debugger override number of threads.
1641  int nth = __kmp_omp_num_threads(loc);
1642  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1643  master_set_numthreads = nth;
1644  }
1645  }
1646 #endif
1647 
1648  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1649  "master_th=%p, gtid=%d\n",
1650  root, parent_team, master_th, gtid));
1651  __kmp_internal_fork(loc, gtid, parent_team);
1652  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1653  "master_th=%p, gtid=%d\n",
1654  root, parent_team, master_th, gtid));
1655 
1656  /* Invoke microtask for MASTER thread */
1657  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1658  parent_team->t.t_id, parent_team->t.t_pkfn));
1659 
1660  if (!parent_team->t.t_invoke(gtid)) {
1661  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1662  }
1663  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1664  parent_team->t.t_id, parent_team->t.t_pkfn));
1665  KMP_MB(); /* Flush all pending memory write invalidates. */
1666 
1667  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1668 
1669  return TRUE;
1670  } // Parallel closely nested in teams construct
1671 #endif /* OMP_40_ENABLED */
1672 
1673 #if KMP_DEBUG
1674  if (__kmp_tasking_mode != tskm_immediate_exec) {
1675  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1676  parent_team->t.t_task_team[master_th->th.th_task_state]);
1677  }
1678 #endif
1679 
1680  if (parent_team->t.t_active_level >=
1681  master_th->th.th_current_task->td_icvs.max_active_levels) {
1682  nthreads = 1;
1683  } else {
1684 #if OMP_40_ENABLED
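  // enter_teams is true when this fork creates the teams league itself
  // (ap == NULL at the top level) or a parallel nested directly inside teams;
  // such forks must not be serialized by the nesting checks below.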
1685  int enter_teams = ((ap == NULL && active_level == 0) ||
1686  (ap && teams_level > 0 && teams_level == level));
1687 #endif
1688  nthreads =
1689  master_set_numthreads
1690  ? master_set_numthreads
1691  : get__nproc_2(
1692  parent_team,
1693  master_tid); // TODO: get nproc directly from current task
1694 
1695  // Check whether we need to take the forkjoin lock (no need for a serialized
1696  // parallel outside of a teams construct). This code was moved here from
1697  // __kmp_reserve_threads() to speed up nested serialized parallels.
1698  if (nthreads > 1) {
1699  if ((!get__nested(master_th) && (root->r.r_in_parallel
1700 #if OMP_40_ENABLED
1701  && !enter_teams
1702 #endif /* OMP_40_ENABLED */
1703  )) ||
1704  (__kmp_library == library_serial)) {
1705  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1706  " threads\n",
1707  gtid, nthreads));
1708  nthreads = 1;
1709  }
1710  }
1711  if (nthreads > 1) {
1712  /* determine how many new threads we can use */
1713  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1714  nthreads = __kmp_reserve_threads(
1715  root, parent_team, master_tid, nthreads
1716 #if OMP_40_ENABLED
1717  /* AC: If we execute teams from a parallel region (on the host), then the
1718  teams should be created, but each can only have 1 thread if
1719  nesting is disabled. If teams is called from a serial region, then the
1720  teams and their threads should be created regardless of the
1721  nesting setting. */
1722  ,
1723  enter_teams
1724 #endif /* OMP_40_ENABLED */
1725  );
1726  if (nthreads == 1) {
1727  // Free lock for single thread execution here; for multi-thread
1728  // execution it will be freed later after team of threads created
1729  // and initialized
1730  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1731  }
1732  }
1733  }
1734  KMP_DEBUG_ASSERT(nthreads > 0);
1735 
1736  // If we temporarily changed the set number of threads then restore it now
1737  master_th->th.th_set_nproc = 0;
1738 
1739  /* create a serialized parallel region? */
1740  if (nthreads == 1) {
1741 /* josh todo: hypothetical question: what do we do for OS X*? */
1742 #if KMP_OS_LINUX && \
1743  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1744  void *args[argc];
1745 #else
1746  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1747 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1748  KMP_ARCH_AARCH64) */
1749 
1750  KA_TRACE(20,
1751  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1752 
1753  __kmpc_serialized_parallel(loc, gtid);
1754 
1755  if (call_context == fork_context_intel) {
1756  /* TODO this sucks, use the compiler itself to pass args! :) */
1757  master_th->th.th_serial_team->t.t_ident = loc;
1758 #if OMP_40_ENABLED
1759  if (!ap) {
1760  // revert change made in __kmpc_serialized_parallel()
1761  master_th->th.th_serial_team->t.t_level--;
1762 // Get args from parent team for teams construct
1763 
1764 #if OMPT_SUPPORT
1765  void *dummy;
1766  void **exit_runtime_p;
1767  ompt_task_info_t *task_info;
1768 
1769  ompt_lw_taskteam_t lw_taskteam;
1770 
1771  if (ompt_enabled.enabled) {
1772  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1773  &ompt_parallel_data, return_address);
1774 
1775  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1776  // don't use lw_taskteam after linking. content was swapped
1777 
1778  task_info = OMPT_CUR_TASK_INFO(master_th);
1779  exit_runtime_p = &(task_info->frame.exit_frame);
1780  if (ompt_enabled.ompt_callback_implicit_task) {
1781  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1782  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1783  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1784  OMPT_CUR_TASK_INFO(master_th)
1785  ->thread_num = __kmp_tid_from_gtid(gtid);
1786  }
1787 
1788  /* OMPT state */
1789  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1790  } else {
1791  exit_runtime_p = &dummy;
1792  }
1793 #endif
1794 
1795  {
1796  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1797  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1798  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1799  parent_team->t.t_argv
1800 #if OMPT_SUPPORT
1801  ,
1802  exit_runtime_p
1803 #endif
1804  );
1805  }
1806 
1807 #if OMPT_SUPPORT
1808  if (ompt_enabled.enabled) {
1809  exit_runtime_p = NULL;
1810  if (ompt_enabled.ompt_callback_implicit_task) {
1811  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1812  ompt_scope_end, NULL, &(task_info->task_data), 1,
1813  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1814  }
1815 
1816  __ompt_lw_taskteam_unlink(master_th);
1817  if (ompt_enabled.ompt_callback_parallel_end) {
1818  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1819  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1820  OMPT_INVOKER(call_context), return_address);
1821  }
1822  master_th->th.ompt_thread_info.state = omp_state_overhead;
1823  }
1824 #endif
1825  } else if (microtask == (microtask_t)__kmp_teams_master) {
1826  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1827  master_th->th.th_serial_team);
1828  team = master_th->th.th_team;
1829  // team->t.t_pkfn = microtask;
1830  team->t.t_invoke = invoker;
1831  __kmp_alloc_argv_entries(argc, team, TRUE);
1832  team->t.t_argc = argc;
1833  argv = (void **)team->t.t_argv;
1834  if (ap) {
1835  for (i = argc - 1; i >= 0; --i)
1836 // TODO: revert workaround for Intel(R) 64 tracker #96
1837 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1838  *argv++ = va_arg(*ap, void *);
1839 #else
1840  *argv++ = va_arg(ap, void *);
1841 #endif
1842  } else {
1843  for (i = 0; i < argc; ++i)
1844  // Get args from parent team for teams construct
1845  argv[i] = parent_team->t.t_argv[i];
1846  }
1847  // AC: revert change made in __kmpc_serialized_parallel()
1848  // because initial code in teams should have level=0
1849  team->t.t_level--;
1850  // AC: call special invoker for outer "parallel" of teams construct
1851  invoker(gtid);
1852  } else {
1853 #endif /* OMP_40_ENABLED */
1854  argv = args;
1855  for (i = argc - 1; i >= 0; --i)
1856 // TODO: revert workaround for Intel(R) 64 tracker #96
1857 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1858  *argv++ = va_arg(*ap, void *);
1859 #else
1860  *argv++ = va_arg(ap, void *);
1861 #endif
1862  KMP_MB();
1863 
1864 #if OMPT_SUPPORT
1865  void *dummy;
1866  void **exit_runtime_p;
1867  ompt_task_info_t *task_info;
1868 
1869  ompt_lw_taskteam_t lw_taskteam;
1870 
1871  if (ompt_enabled.enabled) {
1872  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1873  &ompt_parallel_data, return_address);
1874  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1875  // don't use lw_taskteam after linking. content was swapped
1876  task_info = OMPT_CUR_TASK_INFO(master_th);
1877  exit_runtime_p = &(task_info->frame.exit_frame);
1878 
1879  /* OMPT implicit task begin */
1880  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1881  if (ompt_enabled.ompt_callback_implicit_task) {
1882  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1883  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1884  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1885  OMPT_CUR_TASK_INFO(master_th)
1886  ->thread_num = __kmp_tid_from_gtid(gtid);
1887  }
1888 
1889  /* OMPT state */
1890  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1891  } else {
1892  exit_runtime_p = &dummy;
1893  }
1894 #endif
1895 
1896  {
1897  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1898  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1899  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1900 #if OMPT_SUPPORT
1901  ,
1902  exit_runtime_p
1903 #endif
1904  );
1905  }
1906 
1907 #if OMPT_SUPPORT
1908  if (ompt_enabled.enabled) {
1909  *exit_runtime_p = NULL;
1910  if (ompt_enabled.ompt_callback_implicit_task) {
1911  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1912  ompt_scope_end, NULL, &(task_info->task_data), 1,
1913  OMPT_CUR_TASK_INFO(master_th)->thread_num);
1914  }
1915 
1916  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1917  __ompt_lw_taskteam_unlink(master_th);
1918  if (ompt_enabled.ompt_callback_parallel_end) {
1919  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1920  &ompt_parallel_data, parent_task_data,
1921  OMPT_INVOKER(call_context), return_address);
1922  }
1923  master_th->th.ompt_thread_info.state = omp_state_overhead;
1924  }
1925 #endif
1926 #if OMP_40_ENABLED
1927  }
1928 #endif /* OMP_40_ENABLED */
1929  } else if (call_context == fork_context_gnu) {
1930 #if OMPT_SUPPORT
1931  ompt_lw_taskteam_t lwt;
1932  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1933  return_address);
1934 
1935  lwt.ompt_task_info.frame.exit_frame = NULL;
1936  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1937 // don't use lw_taskteam after linking. content was swapped
1938 #endif
1939 
1940  // we were called from GNU native code
1941  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1942  return FALSE;
1943  } else {
1944  KMP_ASSERT2(call_context < fork_context_last,
1945  "__kmp_fork_call: unknown fork_context parameter");
1946  }
1947 
1948  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1949  KMP_MB();
1950  return FALSE;
1951  }
1952 
1953  // GEH: only modify the executing flag in the case when not serialized;
1954  // the serialized case is handled in __kmpc_serialized_parallel
1955  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1956  "curtask=%p, curtask_max_aclevel=%d\n",
1957  parent_team->t.t_active_level, master_th,
1958  master_th->th.th_current_task,
1959  master_th->th.th_current_task->td_icvs.max_active_levels));
1960  // TODO: GEH - cannot do this assertion because root thread not set up as
1961  // executing
1962  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1963  master_th->th.th_current_task->td_flags.executing = 0;
1964 
1965 #if OMP_40_ENABLED
1966  if (!master_th->th.th_teams_microtask || level > teams_level)
1967 #endif /* OMP_40_ENABLED */
1968  {
1969  /* Increment our nested depth level */
1970  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1971  }
1972 
1973  // See if we need to make a copy of the ICVs.
1974  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1975  if ((level + 1 < __kmp_nested_nth.used) &&
1976  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1977  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1978  } else {
1979  nthreads_icv = 0; // don't update
1980  }
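  // For example (illustrative): with OMP_NUM_THREADS="4,2" the nested nthreads
  // array holds {4, 2}, so a level-0 parallel forking level 1 picks up
  // nthreads_icv == 2 here and copies it into the new team's ICVs below.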
1981 
1982 #if OMP_40_ENABLED
1983  // Figure out the proc_bind_policy for the new team.
1984  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1985  kmp_proc_bind_t proc_bind_icv =
1986  proc_bind_default; // proc_bind_default means don't update
1987  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1988  proc_bind = proc_bind_false;
1989  } else {
1990  if (proc_bind == proc_bind_default) {
1991  // No proc_bind clause specified; use current proc-bind-var for this
1992  // parallel region
1993  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1994  }
1995  /* else: The proc_bind policy was specified explicitly on parallel clause.
1996  This overrides proc-bind-var for this parallel region, but does not
1997  change proc-bind-var. */
1998  // Figure out the value of proc-bind-var for the child threads.
1999  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2000  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2001  master_th->th.th_current_task->td_icvs.proc_bind)) {
2002  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2003  }
2004  }
2005 
2006  // Reset for next parallel region
2007  master_th->th.th_set_proc_bind = proc_bind_default;
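  // For example (illustrative): with OMP_PROC_BIND="spread,close" and a
  // proc_bind(master) clause on this parallel, proc_bind becomes master for
  // this region while proc_bind_icv picks up "close" as the child threads'
  // proc-bind-var.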
2008 #endif /* OMP_40_ENABLED */
2009 
2010  if ((nthreads_icv > 0)
2011 #if OMP_40_ENABLED
2012  || (proc_bind_icv != proc_bind_default)
2013 #endif /* OMP_40_ENABLED */
2014  ) {
2015  kmp_internal_control_t new_icvs;
2016  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2017  new_icvs.next = NULL;
2018  if (nthreads_icv > 0) {
2019  new_icvs.nproc = nthreads_icv;
2020  }
2021 
2022 #if OMP_40_ENABLED
2023  if (proc_bind_icv != proc_bind_default) {
2024  new_icvs.proc_bind = proc_bind_icv;
2025  }
2026 #endif /* OMP_40_ENABLED */
2027 
2028  /* allocate a new parallel team */
2029  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2030  team = __kmp_allocate_team(root, nthreads, nthreads,
2031 #if OMPT_SUPPORT
2032  ompt_parallel_data,
2033 #endif
2034 #if OMP_40_ENABLED
2035  proc_bind,
2036 #endif
2037  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2038  } else {
2039  /* allocate a new parallel team */
2040  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2041  team = __kmp_allocate_team(root, nthreads, nthreads,
2042 #if OMPT_SUPPORT
2043  ompt_parallel_data,
2044 #endif
2045 #if OMP_40_ENABLED
2046  proc_bind,
2047 #endif
2048  &master_th->th.th_current_task->td_icvs,
2049  argc USE_NESTED_HOT_ARG(master_th));
2050  }
2051  KF_TRACE(
2052  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2053 
2054  /* setup the new team */
2055  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2056  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2057  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2058  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2059  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2060 #if OMPT_SUPPORT
2061  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2062  return_address);
2063 #endif
2064  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2065 // TODO: parent_team->t.t_level == INT_MAX ???
2066 #if OMP_40_ENABLED
2067  if (!master_th->th.th_teams_microtask || level > teams_level) {
2068 #endif /* OMP_40_ENABLED */
2069  int new_level = parent_team->t.t_level + 1;
2070  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2071  new_level = parent_team->t.t_active_level + 1;
2072  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2073 #if OMP_40_ENABLED
2074  } else {
2075  // AC: Do not increase parallel level at start of the teams construct
2076  int new_level = parent_team->t.t_level;
2077  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2078  new_level = parent_team->t.t_active_level;
2079  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2080  }
2081 #endif /* OMP_40_ENABLED */
2082  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2083  // set master's schedule as new run-time schedule
2084  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2085 
2086 #if OMP_40_ENABLED
2087  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2088 #endif
2089 #if OMP_50_ENABLED
2090  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2091 #endif
2092 
2093  // Update the floating point rounding in the team if required.
2094  propagateFPControl(team);
2095 
2096  if (__kmp_tasking_mode != tskm_immediate_exec) {
2097  // Set master's task team to the team's task team. Unless this is a hot team,
2098  // it should be NULL.
2099  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2100  parent_team->t.t_task_team[master_th->th.th_task_state]);
2101  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2102  "%p, new task_team %p / team %p\n",
2103  __kmp_gtid_from_thread(master_th),
2104  master_th->th.th_task_team, parent_team,
2105  team->t.t_task_team[master_th->th.th_task_state], team));
2106 
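  // The master's task_state is saved on a small memo stack so it can be
  // restored at join time; the stack is doubled in size when it fills up.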
2107  if (active_level || master_th->th.th_task_team) {
2108  // Take a memo of master's task_state
2109  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2110  if (master_th->th.th_task_state_top >=
2111  master_th->th.th_task_state_stack_sz) { // increase size
2112  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2113  kmp_uint8 *old_stack, *new_stack;
2114  kmp_uint32 i;
2115  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2116  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2117  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2118  }
2119  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2120  ++i) { // zero-init rest of stack
2121  new_stack[i] = 0;
2122  }
2123  old_stack = master_th->th.th_task_state_memo_stack;
2124  master_th->th.th_task_state_memo_stack = new_stack;
2125  master_th->th.th_task_state_stack_sz = new_size;
2126  __kmp_free(old_stack);
2127  }
2128  // Store master's task_state on stack
2129  master_th->th
2130  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2131  master_th->th.th_task_state;
2132  master_th->th.th_task_state_top++;
2133 #if KMP_NESTED_HOT_TEAMS
2134  if (master_th->th.th_hot_teams &&
2135  team == master_th->th.th_hot_teams[active_level].hot_team) {
2136  // Restore master's nested state if this is a nested hot team
2137  master_th->th.th_task_state =
2138  master_th->th
2139  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2140  } else {
2141 #endif
2142  master_th->th.th_task_state = 0;
2143 #if KMP_NESTED_HOT_TEAMS
2144  }
2145 #endif
2146  }
2147 #if !KMP_NESTED_HOT_TEAMS
2148  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2149  (team == root->r.r_hot_team));
2150 #endif
2151  }
2152 
2153  KA_TRACE(
2154  20,
2155  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2156  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2157  team->t.t_nproc));
2158  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2159  (team->t.t_master_tid == 0 &&
2160  (team->t.t_parent == root->r.r_root_team ||
2161  team->t.t_parent->t.t_serialized)));
2162  KMP_MB();
2163 
2164  /* now, setup the arguments */
2165  argv = (void **)team->t.t_argv;
2166 #if OMP_40_ENABLED
2167  if (ap) {
2168 #endif /* OMP_40_ENABLED */
2169  for (i = argc - 1; i >= 0; --i) {
2170 // TODO: revert workaround for Intel(R) 64 tracker #96
2171 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2172  void *new_argv = va_arg(*ap, void *);
2173 #else
2174  void *new_argv = va_arg(ap, void *);
2175 #endif
2176  KMP_CHECK_UPDATE(*argv, new_argv);
2177  argv++;
2178  }
2179 #if OMP_40_ENABLED
2180  } else {
2181  for (i = 0; i < argc; ++i) {
2182  // Get args from parent team for teams construct
2183  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2184  }
2185  }
2186 #endif /* OMP_40_ENABLED */
2187 
2188  /* now actually fork the threads */
2189  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2190  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2191  root->r.r_active = TRUE;
2192 
2193  __kmp_fork_team_threads(root, team, master_th, gtid);
2194  __kmp_setup_icv_copy(team, nthreads,
2195  &master_th->th.th_current_task->td_icvs, loc);
2196 
2197 #if OMPT_SUPPORT
2198  master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2199 #endif
2200 
2201  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2202 
2203 #if USE_ITT_BUILD
2204  if (team->t.t_active_level == 1 // only report frames at level 1
2205 #if OMP_40_ENABLED
2206  && !master_th->th.th_teams_microtask // not in teams construct
2207 #endif /* OMP_40_ENABLED */
2208  ) {
2209 #if USE_ITT_NOTIFY
2210  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2211  (__kmp_forkjoin_frames_mode == 3 ||
2212  __kmp_forkjoin_frames_mode == 1)) {
2213  kmp_uint64 tmp_time = 0;
2214  if (__itt_get_timestamp_ptr)
2215  tmp_time = __itt_get_timestamp();
2216  // Internal fork - report frame begin
2217  master_th->th.th_frame_time = tmp_time;
2218  if (__kmp_forkjoin_frames_mode == 3)
2219  team->t.t_region_time = tmp_time;
2220  } else
2221 // only one notification scheme (either "submit" or "forking/joined", not both)
2222 #endif /* USE_ITT_NOTIFY */
2223  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2224  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2225  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2226  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2227  }
2228  }
2229 #endif /* USE_ITT_BUILD */
2230 
2231  /* now go on and do the work */
2232  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2233  KMP_MB();
2234  KF_TRACE(10,
2235  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2236  root, team, master_th, gtid));
2237 
2238 #if USE_ITT_BUILD
2239  if (__itt_stack_caller_create_ptr) {
2240  team->t.t_stack_id =
2241  __kmp_itt_stack_caller_create(); // create new stack stitching id
2242  // before entering fork barrier
2243  }
2244 #endif /* USE_ITT_BUILD */
2245 
2246 #if OMP_40_ENABLED
2247  // AC: skip __kmp_internal_fork at teams construct, let only master
2248  // threads execute
2249  if (ap)
2250 #endif /* OMP_40_ENABLED */
2251  {
2252  __kmp_internal_fork(loc, gtid, team);
2253  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2254  "master_th=%p, gtid=%d\n",
2255  root, team, master_th, gtid));
2256  }
2257 
2258  if (call_context == fork_context_gnu) {
2259  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2260  return TRUE;
2261  }
2262 
2263  /* Invoke microtask for MASTER thread */
2264  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2265  team->t.t_id, team->t.t_pkfn));
2266  } // END of timer KMP_fork_call block
2267 
2268  if (!team->t.t_invoke(gtid)) {
2269  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2270  }
2271  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2272  team->t.t_id, team->t.t_pkfn));
2273  KMP_MB(); /* Flush all pending memory write invalidates. */
2274 
2275  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2276 
2277 #if OMPT_SUPPORT
2278  if (ompt_enabled.enabled) {
2279  master_th->th.ompt_thread_info.state = omp_state_overhead;
2280  }
2281 #endif
2282 
2283  return TRUE;
2284 }
2285 
2286 #if OMPT_SUPPORT
2287 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2288  kmp_team_t *team) {
2289  // restore state outside the region
2290  thread->th.ompt_thread_info.state =
2291  ((team->t.t_serialized) ? omp_state_work_serial
2292  : omp_state_work_parallel);
2293 }
2294 
2295 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2296  kmp_team_t *team, ompt_data_t *parallel_data,
2297  fork_context_e fork_context, void *codeptr) {
2298  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2299  if (ompt_enabled.ompt_callback_parallel_end) {
2300  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2301  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2302  codeptr);
2303  }
2304 
2305  task_info->frame.enter_frame = NULL;
2306  __kmp_join_restore_state(thread, team);
2307 }
2308 #endif
2309 
2310 void __kmp_join_call(ident_t *loc, int gtid
2311 #if OMPT_SUPPORT
2312  ,
2313  enum fork_context_e fork_context
2314 #endif
2315 #if OMP_40_ENABLED
2316  ,
2317  int exit_teams
2318 #endif /* OMP_40_ENABLED */
2319  ) {
2320  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2321  kmp_team_t *team;
2322  kmp_team_t *parent_team;
2323  kmp_info_t *master_th;
2324  kmp_root_t *root;
2325  int master_active;
2326  int i;
2327 
2328  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2329 
2330  /* setup current data */
2331  master_th = __kmp_threads[gtid];
2332  root = master_th->th.th_root;
2333  team = master_th->th.th_team;
2334  parent_team = team->t.t_parent;
2335 
2336  master_th->th.th_ident = loc;
2337 
2338 #if OMPT_SUPPORT
2339  if (ompt_enabled.enabled) {
2340  master_th->th.ompt_thread_info.state = omp_state_overhead;
2341  }
2342 #endif
2343 
2344 #if KMP_DEBUG
2345  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2346  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2347  "th_task_team = %p\n",
2348  __kmp_gtid_from_thread(master_th), team,
2349  team->t.t_task_team[master_th->th.th_task_state],
2350  master_th->th.th_task_team));
2351  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2352  team->t.t_task_team[master_th->th.th_task_state]);
2353  }
2354 #endif
2355 
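  // A serialized team means this join ends a serialized parallel region (or
  // the serial part of a teams construct); no real join barrier is needed.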
2356  if (team->t.t_serialized) {
2357 #if OMP_40_ENABLED
2358  if (master_th->th.th_teams_microtask) {
2359  // We are in teams construct
2360  int level = team->t.t_level;
2361  int tlevel = master_th->th.th_teams_level;
2362  if (level == tlevel) {
2363  // AC: we haven't incremented it earlier at start of teams construct,
2364  // so do it here - at the end of teams construct
2365  team->t.t_level++;
2366  } else if (level == tlevel + 1) {
2367  // AC: we are exiting parallel inside teams, need to increment
2368  // serialization in order to restore it in the next call to
2369  // __kmpc_end_serialized_parallel
2370  team->t.t_serialized++;
2371  }
2372  }
2373 #endif /* OMP_40_ENABLED */
2374  __kmpc_end_serialized_parallel(loc, gtid);
2375 
2376 #if OMPT_SUPPORT
2377  if (ompt_enabled.enabled) {
2378  __kmp_join_restore_state(master_th, parent_team);
2379  }
2380 #endif
2381 
2382  return;
2383  }
2384 
2385  master_active = team->t.t_master_active;
2386 
2387 #if OMP_40_ENABLED
2388  if (!exit_teams)
2389 #endif /* OMP_40_ENABLED */
2390  {
2391  // AC: No barrier for internal teams at exit from the teams construct,
2392  // but there is a barrier for the external team (league).
2393  __kmp_internal_join(loc, gtid, team);
2394  }
2395 #if OMP_40_ENABLED
2396  else {
2397  master_th->th.th_task_state =
2398  0; // AC: no tasking in teams (out of any parallel)
2399  }
2400 #endif /* OMP_40_ENABLED */
2401 
2402  KMP_MB();
2403 
2404 #if OMPT_SUPPORT
2405  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2406  void *codeptr = team->t.ompt_team_info.master_return_address;
2407 #endif
2408 
2409 #if USE_ITT_BUILD
2410  if (__itt_stack_caller_create_ptr) {
2411  __kmp_itt_stack_caller_destroy(
2412  (__itt_caller)team->t
2413  .t_stack_id); // destroy the stack stitching id after join barrier
2414  }
2415 
2416  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2417  if (team->t.t_active_level == 1
2418 #if OMP_40_ENABLED
2419  && !master_th->th.th_teams_microtask /* not in teams construct */
2420 #endif /* OMP_40_ENABLED */
2421  ) {
2422  master_th->th.th_ident = loc;
2423  // only one notification scheme (either "submit" or "forking/joined", not
2424  // both)
2425  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2426  __kmp_forkjoin_frames_mode == 3)
2427  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2428  master_th->th.th_frame_time, 0, loc,
2429  master_th->th.th_team_nproc, 1);
2430  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2431  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2432  __kmp_itt_region_joined(gtid);
2433  } // active_level == 1
2434 #endif /* USE_ITT_BUILD */
2435 
2436 #if OMP_40_ENABLED
2437  if (master_th->th.th_teams_microtask && !exit_teams &&
2438  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2439  team->t.t_level == master_th->th.th_teams_level + 1) {
2440  // AC: We need to leave the team structure intact at the end of a parallel
2441  // inside the teams construct, so that the same (hot) team is reused by the
2442  // next parallel; only adjust the nesting levels.
2443 
2444  /* Decrement our nested depth level */
2445  team->t.t_level--;
2446  team->t.t_active_level--;
2447  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2448 
2449  /* Restore number of threads in the team if needed */
2450  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2451  int old_num = master_th->th.th_team_nproc;
2452  int new_num = master_th->th.th_teams_size.nth;
2453  kmp_info_t **other_threads = team->t.t_threads;
2454  team->t.t_nproc = new_num;
2455  for (i = 0; i < old_num; ++i) {
2456  other_threads[i]->th.th_team_nproc = new_num;
2457  }
2458  // Adjust the states of the unused threads of the team
2459  for (i = old_num; i < new_num; ++i) {
2460  // Re-initialize thread's barrier data.
2461  int b;
2462  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2463  for (b = 0; b < bs_last_barrier; ++b) {
2464  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2465  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2466 #if USE_DEBUGGER
2467  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2468 #endif
2469  }
2470  if (__kmp_tasking_mode != tskm_immediate_exec) {
2471  // Synchronize thread's task state
2472  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2473  }
2474  }
2475  }
2476 
2477 #if OMPT_SUPPORT
2478  if (ompt_enabled.enabled) {
2479  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2480  codeptr);
2481  }
2482 #endif
2483 
2484  return;
2485  }
2486 #endif /* OMP_40_ENABLED */
2487 
2488  /* do cleanup and restore the parent team */
2489  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2490  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2491 
2492  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2493 
2494  /* jc: The following lock has instructions with REL and ACQ semantics,
2495  separating the parallel user code called in this parallel region
2496  from the serial user code called after this function returns. */
2497  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2498 
2499 #if OMP_40_ENABLED
2500  if (!master_th->th.th_teams_microtask ||
2501  team->t.t_level > master_th->th.th_teams_level)
2502 #endif /* OMP_40_ENABLED */
2503  {
2504  /* Decrement our nested depth level */
2505  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2506  }
2507  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2508 
2509 #if OMPT_SUPPORT
2510  if (ompt_enabled.enabled) {
2511  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2512  if (ompt_enabled.ompt_callback_implicit_task) {
2513  int ompt_team_size = team->t.t_nproc;
2514  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2515  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2516  OMPT_CUR_TASK_INFO(master_th)->thread_num);
2517  }
2518 
2519  task_info->frame.exit_frame = NULL;
2520  task_info->task_data = ompt_data_none;
2521  }
2522 #endif
2523 
2524  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2525  master_th, team));
2526  __kmp_pop_current_task_from_thread(master_th);
2527 
2528 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2529  // Restore master thread's partition.
2530  master_th->th.th_first_place = team->t.t_first_place;
2531  master_th->th.th_last_place = team->t.t_last_place;
2532 #endif /* OMP_40_ENABLED */
2533 #if OMP_50_ENABLED
2534  master_th->th.th_def_allocator = team->t.t_def_allocator;
2535 #endif
2536 
2537  updateHWFPControl(team);
2538 
2539  if (root->r.r_active != master_active)
2540  root->r.r_active = master_active;
2541 
2542  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2543  master_th)); // this will free worker threads
2544 
2545  /* This race was fun to find. Make sure the following is in the critical
2546  region; otherwise assertions may fail occasionally since the old team may be
2547  reallocated and the hierarchy appears inconsistent. It is actually safe to
2548  run and won't cause any bugs, but it will cause those assertion failures. It's
2549  only one deref & assign, so we might as well put this in the critical region. */
2550  master_th->th.th_team = parent_team;
2551  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2552  master_th->th.th_team_master = parent_team->t.t_threads[0];
2553  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2554 
2555  /* restore serialized team, if need be */
2556  if (parent_team->t.t_serialized &&
2557  parent_team != master_th->th.th_serial_team &&
2558  parent_team != root->r.r_root_team) {
2559  __kmp_free_team(root,
2560  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2561  master_th->th.th_serial_team = parent_team;
2562  }
2563 
2564  if (__kmp_tasking_mode != tskm_immediate_exec) {
2565  if (master_th->th.th_task_state_top >
2566  0) { // Restore task state from memo stack
2567  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2568  // Remember master's state if we re-use this nested hot team
2569  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2570  master_th->th.th_task_state;
2571  --master_th->th.th_task_state_top; // pop
2572  // Now restore state at this level
2573  master_th->th.th_task_state =
2574  master_th->th
2575  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2576  }
2577  // Copy the task team from the parent team to the master thread
2578  master_th->th.th_task_team =
2579  parent_team->t.t_task_team[master_th->th.th_task_state];
2580  KA_TRACE(20,
2581  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2582  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2583  parent_team));
2584  }
2585 
2586  // TODO: GEH - cannot do this assertion because root thread not set up as
2587  // executing
2588  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2589  master_th->th.th_current_task->td_flags.executing = 1;
2590 
2591  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2592 
2593 #if OMPT_SUPPORT
2594  if (ompt_enabled.enabled) {
2595  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2596  codeptr);
2597  }
2598 #endif
2599 
2600  KMP_MB();
2601  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2602 }
2603 
2604 /* Check whether we should push an internal control record onto the
2605  serial team stack. If so, do it. */
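/* For example, omp_set_num_threads() called inside a nested, serialized
   parallel region must not leak into the enclosing level, so the current ICVs
   are pushed onto the serial team's control stack and restored when the
   serialized region ends. */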
2606 void __kmp_save_internal_controls(kmp_info_t *thread) {
2607 
2608  if (thread->th.th_team != thread->th.th_serial_team) {
2609  return;
2610  }
2611  if (thread->th.th_team->t.t_serialized > 1) {
2612  int push = 0;
2613 
2614  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2615  push = 1;
2616  } else {
2617  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2618  thread->th.th_team->t.t_serialized) {
2619  push = 1;
2620  }
2621  }
2622  if (push) { /* push a record on the serial team's stack */
2623  kmp_internal_control_t *control =
2624  (kmp_internal_control_t *)__kmp_allocate(
2625  sizeof(kmp_internal_control_t));
2626 
2627  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2628 
2629  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2630 
2631  control->next = thread->th.th_team->t.t_control_stack_top;
2632  thread->th.th_team->t.t_control_stack_top = control;
2633  }
2634  }
2635 }
2636 
2637 /* Changes set_nproc */
2638 void __kmp_set_num_threads(int new_nth, int gtid) {
2639  kmp_info_t *thread;
2640  kmp_root_t *root;
2641 
2642  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2643  KMP_DEBUG_ASSERT(__kmp_init_serial);
2644 
2645  if (new_nth < 1)
2646  new_nth = 1;
2647  else if (new_nth > __kmp_max_nth)
2648  new_nth = __kmp_max_nth;
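  // e.g. a request of 0 is clamped up to 1, and a request above __kmp_max_nth
  // is clamped down to the library-wide thread limit.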
2649 
2650  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2651  thread = __kmp_threads[gtid];
2652 
2653  __kmp_save_internal_controls(thread);
2654 
2655  set__nproc(thread, new_nth);
2656 
2657  // If this omp_set_num_threads() call will cause the hot team size to be
2658  // reduced (in the absence of a num_threads clause), then reduce it now,
2659  // rather than waiting for the next parallel region.
2660  root = thread->th.th_root;
2661  if (__kmp_init_parallel && (!root->r.r_active) &&
2662  (root->r.r_hot_team->t.t_nproc > new_nth)
2663 #if KMP_NESTED_HOT_TEAMS
2664  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2665 #endif
2666  ) {
2667  kmp_team_t *hot_team = root->r.r_hot_team;
2668  int f;
2669 
2670  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2671 
2672  // Release the extra threads we don't need any more.
2673  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2674  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2675  if (__kmp_tasking_mode != tskm_immediate_exec) {
2676  // When decreasing the team size, threads no longer in the team should
2677  // unreference the task team.
2678  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2679  }
2680  __kmp_free_thread(hot_team->t.t_threads[f]);
2681  hot_team->t.t_threads[f] = NULL;
2682  }
2683  hot_team->t.t_nproc = new_nth;
2684 #if KMP_NESTED_HOT_TEAMS
2685  if (thread->th.th_hot_teams) {
2686  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2687  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2688  }
2689 #endif
2690 
2691  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2692 
2693  // Update the t_nproc field in the threads that are still active.
2694  for (f = 0; f < new_nth; f++) {
2695  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2696  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2697  }
2698  // Special flag to indicate an omp_set_num_threads() call
2699  hot_team->t.t_size_changed = -1;
2700  }
2701 }
2702 
2703 /* Changes max_active_levels */
2704 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2705  kmp_info_t *thread;
2706 
2707  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2708  "%d = (%d)\n",
2709  gtid, max_active_levels));
2710  KMP_DEBUG_ASSERT(__kmp_init_serial);
2711 
2712  // validate max_active_levels
2713  if (max_active_levels < 0) {
2714  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2715  // We ignore this call if the user has specified a negative value.
2716  // The current setting won't be changed. The last valid setting will be
2717  // used. A warning will be issued (if warnings are allowed as controlled by
2718  // the KMP_WARNINGS env var).
2719  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2720  "max_active_levels for thread %d = (%d)\n",
2721  gtid, max_active_levels));
2722  return;
2723  }
2724  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2725  // it's OK, the max_active_levels is within the valid range: [ 0;
2726  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2727  // We allow a zero value. (implementation defined behavior)
2728  } else {
2729  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2730  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2731  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2732  // Current upper limit is MAX_INT. (implementation defined behavior)
2733  // If the input exceeds the upper limit, we correct the input to be the
2734  // upper limit. (implementation defined behavior)
2735  // Actually, the flow should never get here as long as the limit is MAX_INT.
2736  }
2737  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2738  "max_active_levels for thread %d = (%d)\n",
2739  gtid, max_active_levels));
2740 
2741  thread = __kmp_threads[gtid];
2742 
2743  __kmp_save_internal_controls(thread);
2744 
2745  set__max_active_levels(thread, max_active_levels);
2746 }
2747 
2748 /* Gets max_active_levels */
2749 int __kmp_get_max_active_levels(int gtid) {
2750  kmp_info_t *thread;
2751 
2752  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2753  KMP_DEBUG_ASSERT(__kmp_init_serial);
2754 
2755  thread = __kmp_threads[gtid];
2756  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2757  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2758  "curtask_maxaclevel=%d\n",
2759  gtid, thread->th.th_current_task,
2760  thread->th.th_current_task->td_icvs.max_active_levels));
2761  return thread->th.th_current_task->td_icvs.max_active_levels;
2762 }
2763 
2764 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2765 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2766  kmp_info_t *thread;
2767  // kmp_team_t *team;
2768 
2769  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2770  gtid, (int)kind, chunk));
2771  KMP_DEBUG_ASSERT(__kmp_init_serial);
2772 
2773  // Check if the kind parameter is valid, correct if needed.
2774  // Valid parameters should fit in one of two intervals - standard or extended:
2775  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2776  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
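  // i.e. the valid standard kinds are 1..4 (static, dynamic, guided, auto) and
  // the valid extended kinds are 101..102 (trapezoidal, static_steal).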
2777  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2778  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2779  // TODO: Hint needs attention in case we change the default schedule.
2780  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2781  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2782  __kmp_msg_null);
2783  kind = kmp_sched_default;
2784  chunk = 0; // ignore chunk value in case of bad kind
2785  }
2786 
2787  thread = __kmp_threads[gtid];
2788 
2789  __kmp_save_internal_controls(thread);
2790 
2791  if (kind < kmp_sched_upper_std) {
2792  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2793  // differentiate static chunked vs. unchunked: the chunk should be invalid to
2794  // indicate an unchunked schedule (which is the default)
2795  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2796  } else {
2797  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798  __kmp_sch_map[kind - kmp_sched_lower - 1];
2799  }
2800  } else {
2801  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2802  // kmp_sched_lower - 2 ];
2803  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2804  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2805  kmp_sched_lower - 2];
2806  }
2807  if (kind == kmp_sched_auto || chunk < 1) {
2808  // ignore parameter chunk for schedule auto
2809  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2810  } else {
2811  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2812  }
2813 }
2814 
2815 /* Gets def_sched_var ICV values */
2816 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2817  kmp_info_t *thread;
2818  enum sched_type th_type;
2819 
2820  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2821  KMP_DEBUG_ASSERT(__kmp_init_serial);
2822 
2823  thread = __kmp_threads[gtid];
2824 
2825  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2826 
2827  switch (th_type) {
2828  case kmp_sch_static:
2829  case kmp_sch_static_greedy:
2830  case kmp_sch_static_balanced:
2831  *kind = kmp_sched_static;
2832  *chunk = 0; // chunk was not set; indicate this with a zero value
2833  return;
2834  case kmp_sch_static_chunked:
2835  *kind = kmp_sched_static;
2836  break;
2837  case kmp_sch_dynamic_chunked:
2838  *kind = kmp_sched_dynamic;
2839  break;
2840  case kmp_sch_guided_chunked:
2841  case kmp_sch_guided_iterative_chunked:
2842  case kmp_sch_guided_analytical_chunked:
2843  *kind = kmp_sched_guided;
2844  break;
2845  case kmp_sch_auto:
2846  *kind = kmp_sched_auto;
2847  break;
2848  case kmp_sch_trapezoidal:
2849  *kind = kmp_sched_trapezoidal;
2850  break;
2851 #if KMP_STATIC_STEAL_ENABLED
2852  case kmp_sch_static_steal:
2853  *kind = kmp_sched_static_steal;
2854  break;
2855 #endif
2856  default:
2857  KMP_FATAL(UnknownSchedulingType, th_type);
2858  }
2859 
2860  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2861 }
2862 
2863 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2864 
2865  int ii, dd;
2866  kmp_team_t *team;
2867  kmp_info_t *thr;
2868 
2869  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2870  KMP_DEBUG_ASSERT(__kmp_init_serial);
2871 
2872  // validate level
2873  if (level == 0)
2874  return 0;
2875  if (level < 0)
2876  return -1;
2877  thr = __kmp_threads[gtid];
2878  team = thr->th.th_team;
2879  ii = team->t.t_level;
2880  if (level > ii)
2881  return -1;
2882 
2883 #if OMP_40_ENABLED
2884  if (thr->th.th_teams_microtask) {
2885  // AC: we are in teams region where multiple nested teams have same level
2886  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2887  if (level <=
2888  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2889  KMP_DEBUG_ASSERT(ii >= tlevel);
2890  // AC: As we need to pass by the teams league, we need to artificially
2891  // increase ii
2892  if (ii == tlevel) {
2893  ii += 2; // three teams have same level
2894  } else {
2895  ii++; // two teams have same level
2896  }
2897  }
2898  }
2899 #endif
2900 
2901  if (ii == level)
2902  return __kmp_tid_from_gtid(gtid);
2903 
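  // Walk up the team tree: dd counts serialized levels collapsed into the
  // current team, while ii tracks the nesting level currently being examined.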
2904  dd = team->t.t_serialized;
2905  level++;
2906  while (ii > level) {
2907  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908  }
2909  if ((team->t.t_serialized) && (!dd)) {
2910  team = team->t.t_parent;
2911  continue;
2912  }
2913  if (ii > level) {
2914  team = team->t.t_parent;
2915  dd = team->t.t_serialized;
2916  ii--;
2917  }
2918  }
2919 
2920  return (dd > 1) ? (0) : (team->t.t_master_tid);
2921 }
2922 
2923 int __kmp_get_team_size(int gtid, int level) {
2924 
2925  int ii, dd;
2926  kmp_team_t *team;
2927  kmp_info_t *thr;
2928 
2929  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2930  KMP_DEBUG_ASSERT(__kmp_init_serial);
2931 
2932  // validate level
2933  if (level == 0)
2934  return 1;
2935  if (level < 0)
2936  return -1;
2937  thr = __kmp_threads[gtid];
2938  team = thr->th.th_team;
2939  ii = team->t.t_level;
2940  if (level > ii)
2941  return -1;
2942 
2943 #if OMP_40_ENABLED
2944  if (thr->th.th_teams_microtask) {
2945  // AC: we are in a teams region where multiple nested teams have the same level
2946  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2947  if (level <=
2948  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2949  KMP_DEBUG_ASSERT(ii >= tlevel);
2950  // AC: As we need to pass by the teams league, we need to artificially
2951  // increase ii
2952  if (ii == tlevel) {
2953  ii += 2; // three teams have same level
2954  } else {
2955  ii++; // two teams have same level
2956  }
2957  }
2958  }
2959 #endif
2960 
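  // Same level-walking scheme as in __kmp_get_ancestor_thread_num() above.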
2961  while (ii > level) {
2962  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2963  }
2964  if (team->t.t_serialized && (!dd)) {
2965  team = team->t.t_parent;
2966  continue;
2967  }
2968  if (ii > level) {
2969  team = team->t.t_parent;
2970  ii--;
2971  }
2972  }
2973 
2974  return team->t.t_nproc;
2975 }
2976 
2977 kmp_r_sched_t __kmp_get_schedule_global() {
2978  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2979  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2980  // independently, so one can get the updated schedule here.
2981 
2982  kmp_r_sched_t r_sched;
2983 
2984  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2985  // __kmp_guided. __kmp_sched should keep its original value, so that the user can set
2986  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2987  // different roots (even in OMP 2.5)
2988  if (__kmp_sched == kmp_sch_static) {
2989  // replace STATIC with more detailed schedule (balanced or greedy)
2990  r_sched.r_sched_type = __kmp_static;
2991  } else if (__kmp_sched == kmp_sch_guided_chunked) {
2992  // replace GUIDED with more detailed schedule (iterative or analytical)
2993  r_sched.r_sched_type = __kmp_guided;
2994  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2995  r_sched.r_sched_type = __kmp_sched;
2996  }
2997 
2998  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2999  // __kmp_chunk may be wrong here (if it was not ever set)
3000  r_sched.chunk = KMP_DEFAULT_CHUNK;
3001  } else {
3002  r_sched.chunk = __kmp_chunk;
3003  }
3004 
3005  return r_sched;
3006 }
3007 
3008 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3009  at least argc *t_argv entries for the requested team. */
3010 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3011 
3012  KMP_DEBUG_ASSERT(team);
3013  if (!realloc || argc > team->t.t_max_argc) {
3014 
3015  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3016  "current entries=%d\n",
3017  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3018  /* if previously allocated heap space for args, free them */
3019  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3020  __kmp_free((void *)team->t.t_argv);
3021 
3022  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3023  /* use unused space in the cache line for arguments */
3024  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3025  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3026  "argv entries\n",
3027  team->t.t_id, team->t.t_max_argc));
3028  team->t.t_argv = &team->t.t_inline_argv[0];
3029  if (__kmp_storage_map) {
3030  __kmp_print_storage_map_gtid(
3031  -1, &team->t.t_inline_argv[0],
3032  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3033  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3034  team->t.t_id);
3035  }
3036  } else {
3037  /* allocate space for arguments in the heap */
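  // Growth policy: small requests are rounded up to KMP_MIN_MALLOC_ARGV_ENTRIES,
  // larger ones get 2 * argc entries to amortize future reallocations.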
3038  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3039  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3040  : 2 * argc;
3041  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3042  "argv entries\n",
3043  team->t.t_id, team->t.t_max_argc));
3044  team->t.t_argv =
3045  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3046  if (__kmp_storage_map) {
3047  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3048  &team->t.t_argv[team->t.t_max_argc],
3049  sizeof(void *) * team->t.t_max_argc,
3050  "team_%d.t_argv", team->t.t_id);
3051  }
3052  }
3053  }
3054 }
3055 
3056 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3057  int i;
3058  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
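  // A serialized/singleton team (max_nth == 1) only needs 2 dispatch buffers;
  // otherwise use the configured __kmp_dispatch_num_buffers.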
3059  team->t.t_threads =
3060  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3061  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3062  sizeof(dispatch_shared_info_t) * num_disp_buff);
3063  team->t.t_dispatch =
3064  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3065  team->t.t_implicit_task_taskdata =
3066  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3067  team->t.t_max_nproc = max_nth;
3068 
3069  /* setup dispatch buffers */
3070  for (i = 0; i < num_disp_buff; ++i) {
3071  team->t.t_disp_buffer[i].buffer_index = i;
3072 #if OMP_45_ENABLED
3073  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3074 #endif
3075  }
3076 }
3077 
3078 static void __kmp_free_team_arrays(kmp_team_t *team) {
3079  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3080  int i;
3081  for (i = 0; i < team->t.t_max_nproc; ++i) {
3082  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3083  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3084  team->t.t_dispatch[i].th_disp_buffer = NULL;
3085  }
3086  }
3087 #if KMP_USE_HIER_SCHED
3088  __kmp_dispatch_free_hierarchies(team);
3089 #endif
3090  __kmp_free(team->t.t_threads);
3091  __kmp_free(team->t.t_disp_buffer);
3092  __kmp_free(team->t.t_dispatch);
3093  __kmp_free(team->t.t_implicit_task_taskdata);
3094  team->t.t_threads = NULL;
3095  team->t.t_disp_buffer = NULL;
3096  team->t.t_dispatch = NULL;
3097  team->t.t_implicit_task_taskdata = 0;
3098 }
3099 
3100 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3101  kmp_info_t **oldThreads = team->t.t_threads;
3102 
3103  __kmp_free(team->t.t_disp_buffer);
3104  __kmp_free(team->t.t_dispatch);
3105  __kmp_free(team->t.t_implicit_task_taskdata);
3106  __kmp_allocate_team_arrays(team, max_nth);
3107 
3108  KMP_MEMCPY(team->t.t_threads, oldThreads,
3109  team->t.t_nproc * sizeof(kmp_info_t *));
3110 
3111  __kmp_free(oldThreads);
3112 }
3113 
3114 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3115 
3116  kmp_r_sched_t r_sched =
3117  __kmp_get_schedule_global(); // get current state of scheduling globals
3118 
3119 #if OMP_40_ENABLED
3120  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3121 #endif /* OMP_40_ENABLED */
3122 
3123  kmp_internal_control_t g_icvs = {
3124  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3125  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3126  // for nested parallelism (per thread)
3127  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3128  // adjustment of threads (per thread)
3129  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3130  // whether blocktime is explicitly set
3131  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3132 #if KMP_USE_MONITOR
3133  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3134 // intervals
3135 #endif
3136  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3137  // next parallel region (per thread)
3138  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3139  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3140  // for max_active_levels
3141  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3142 // {sched,chunk} pair
3143 #if OMP_40_ENABLED
3144  __kmp_nested_proc_bind.bind_types[0],
3145  __kmp_default_device,
3146 #endif /* OMP_40_ENABLED */
3147  NULL // struct kmp_internal_control *next;
3148  };
3149 
3150  return g_icvs;
3151 }
3152 
3153 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3154 
3155  kmp_internal_control_t gx_icvs;
3156  gx_icvs.serial_nesting_level =
3157  0; // probably =team->t.t_serial like in save_inter_controls
3158  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3159  gx_icvs.next = NULL;
3160 
3161  return gx_icvs;
3162 }
3163 
3164 static void __kmp_initialize_root(kmp_root_t *root) {
3165  int f;
3166  kmp_team_t *root_team;
3167  kmp_team_t *hot_team;
3168  int hot_team_max_nth;
3169  kmp_r_sched_t r_sched =
3170  __kmp_get_schedule_global(); // get current state of scheduling globals
3171  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3172  KMP_DEBUG_ASSERT(root);
3173  KMP_ASSERT(!root->r.r_begin);
3174 
3175  /* setup the root state structure */
3176  __kmp_init_lock(&root->r.r_begin_lock);
3177  root->r.r_begin = FALSE;
3178  root->r.r_active = FALSE;
3179  root->r.r_in_parallel = 0;
3180  root->r.r_blocktime = __kmp_dflt_blocktime;
3181  root->r.r_nested = __kmp_dflt_nested;
3182  root->r.r_cg_nthreads = 1;
3183 
3184  /* setup the root team for this task */
3185  /* allocate the root team structure */
3186  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3187 
3188  root_team =
3189  __kmp_allocate_team(root,
3190  1, // new_nproc
3191  1, // max_nproc
3192 #if OMPT_SUPPORT
3193  ompt_data_none, // root parallel id
3194 #endif
3195 #if OMP_40_ENABLED
3196  __kmp_nested_proc_bind.bind_types[0],
3197 #endif
3198  &r_icvs,
3199  0 // argc
3200  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3201  );
3202 #if USE_DEBUGGER
3203  // Non-NULL value should be assigned to make the debugger display the root
3204  // team.
3205  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3206 #endif
3207 
3208  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3209 
3210  root->r.r_root_team = root_team;
3211  root_team->t.t_control_stack_top = NULL;
3212 
3213  /* initialize root team */
3214  root_team->t.t_threads[0] = NULL;
3215  root_team->t.t_nproc = 1;
3216  root_team->t.t_serialized = 1;
3217  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3218  root_team->t.t_sched.sched = r_sched.sched;
3219  KA_TRACE(
3220  20,
3221  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3222  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3223 
3224  /* setup the hot team for this task */
3225  /* allocate the hot team structure */
3226  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3227 
3228  hot_team =
3229  __kmp_allocate_team(root,
3230  1, // new_nproc
3231  __kmp_dflt_team_nth_ub * 2, // max_nproc
3232 #if OMPT_SUPPORT
3233  ompt_data_none, // root parallel id
3234 #endif
3235 #if OMP_40_ENABLED
3236  __kmp_nested_proc_bind.bind_types[0],
3237 #endif
3238  &r_icvs,
3239  0 // argc
3240  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3241  );
3242  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3243 
3244  root->r.r_hot_team = hot_team;
3245  root_team->t.t_control_stack_top = NULL;
3246 
3247  /* first-time initialization */
3248  hot_team->t.t_parent = root_team;
3249 
3250  /* initialize hot team */
3251  hot_team_max_nth = hot_team->t.t_max_nproc;
3252  for (f = 0; f < hot_team_max_nth; ++f) {
3253  hot_team->t.t_threads[f] = NULL;
3254  }
3255  hot_team->t.t_nproc = 1;
3256  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3257  hot_team->t.t_sched.sched = r_sched.sched;
3258  hot_team->t.t_size_changed = 0;
3259 }
3260 
3261 #ifdef KMP_DEBUG
3262 
3263 typedef struct kmp_team_list_item {
3264  kmp_team_p const *entry;
3265  struct kmp_team_list_item *next;
3266 } kmp_team_list_item_t;
3267 typedef kmp_team_list_item_t *kmp_team_list_t;
3268 
3269 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3270  kmp_team_list_t list, // List of teams.
3271  kmp_team_p const *team // Team to add.
3272  ) {
3273 
3274  // List must terminate with item where both entry and next are NULL.
3275  // Team is added to the list only once.
3276  // List is sorted in ascending order by team id.
3277  // Team id is *not* a key.
3278 
3279  kmp_team_list_t l;
3280 
3281  KMP_DEBUG_ASSERT(list != NULL);
3282  if (team == NULL) {
3283  return;
3284  }
3285 
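  // Recursively add the team's parent and pool successor so they are listed too.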
3286  __kmp_print_structure_team_accum(list, team->t.t_parent);
3287  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3288 
3289  // Search list for the team.
3290  l = list;
3291  while (l->next != NULL && l->entry != team) {
3292  l = l->next;
3293  }
3294  if (l->next != NULL) {
3295  return; // Team has been added before, exit.
3296  }
3297 
3298  // Team was not found. Search the list again for the insertion point.
3299  l = list;
3300  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3301  l = l->next;
3302  }
3303 
3304  // Insert team.
3305  {
3306  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3307  sizeof(kmp_team_list_item_t));
3308  *item = *l;
3309  l->entry = team;
3310  l->next = item;
3311  }
3312 }
3313 
3314 static void __kmp_print_structure_team(char const *title,
3315  kmp_team_p const *team) {
3316 
3317  __kmp_printf("%s", title);
3318  if (team != NULL) {
3319  __kmp_printf("%2x %p\n", team->t.t_id, team);
3320  } else {
3321  __kmp_printf(" - (nil)\n");
3322  }
3323 }
3324 
3325 static void __kmp_print_structure_thread(char const *title,
3326  kmp_info_p const *thread) {
3327  __kmp_printf("%s", title);
3328  if (thread != NULL) {
3329  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3330  } else {
3331  __kmp_printf(" - (nil)\n");
3332  }
3333 }
3334 
3335 void __kmp_print_structure(void) {
3336 
3337  kmp_team_list_t list;
3338 
3339  // Initialize list of teams.
3340  list =
3341  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3342  list->entry = NULL;
3343  list->next = NULL;
3344 
3345  __kmp_printf("\n------------------------------\nGlobal Thread "
3346  "Table\n------------------------------\n");
3347  {
3348  int gtid;
3349  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3350  __kmp_printf("%2d", gtid);
3351  if (__kmp_threads != NULL) {
3352  __kmp_printf(" %p", __kmp_threads[gtid]);
3353  }
3354  if (__kmp_root != NULL) {
3355  __kmp_printf(" %p", __kmp_root[gtid]);
3356  }
3357  __kmp_printf("\n");
3358  }
3359  }
3360 
3361  // Print out __kmp_threads array.
3362  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3363  "----------\n");
3364  if (__kmp_threads != NULL) {
3365  int gtid;
3366  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3367  kmp_info_t const *thread = __kmp_threads[gtid];
3368  if (thread != NULL) {
3369  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3370  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3371  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3372  __kmp_print_structure_team(" Serial Team: ",
3373  thread->th.th_serial_team);
3374  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3375  __kmp_print_structure_thread(" Master: ",
3376  thread->th.th_team_master);
3377  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3378  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3379 #if OMP_40_ENABLED
3380  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3381 #endif
3382  __kmp_print_structure_thread(" Next in pool: ",
3383  thread->th.th_next_pool);
3384  __kmp_printf("\n");
3385  __kmp_print_structure_team_accum(list, thread->th.th_team);
3386  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3387  }
3388  }
3389  } else {
3390  __kmp_printf("Threads array is not allocated.\n");
3391  }
3392 
3393  // Print out __kmp_root array.
3394  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3395  "--------\n");
3396  if (__kmp_root != NULL) {
3397  int gtid;
3398  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3399  kmp_root_t const *root = __kmp_root[gtid];
3400  if (root != NULL) {
3401  __kmp_printf("GTID %2d %p:\n", gtid, root);
3402  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3403  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3404  __kmp_print_structure_thread(" Uber Thread: ",
3405  root->r.r_uber_thread);
3406  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3407  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3408  __kmp_printf(" In Parallel: %2d\n",
3409  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3410  __kmp_printf("\n");
3411  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3412  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3413  }
3414  }
3415  } else {
3416  __kmp_printf("Ubers array is not allocated.\n");
3417  }
3418 
3419  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3420  "--------\n");
3421  while (list->next != NULL) {
3422  kmp_team_p const *team = list->entry;
3423  int i;
3424  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3425  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3426  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3427  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3428  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3429  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3430  for (i = 0; i < team->t.t_nproc; ++i) {
3431  __kmp_printf(" Thread %2d: ", i);
3432  __kmp_print_structure_thread("", team->t.t_threads[i]);
3433  }
3434  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3435  __kmp_printf("\n");
3436  list = list->next;
3437  }
3438 
3439  // Print out __kmp_thread_pool and __kmp_team_pool.
3440  __kmp_printf("\n------------------------------\nPools\n----------------------"
3441  "--------\n");
3442  __kmp_print_structure_thread("Thread pool: ",
3443  CCAST(kmp_info_t *, __kmp_thread_pool));
3444  __kmp_print_structure_team("Team pool: ",
3445  CCAST(kmp_team_t *, __kmp_team_pool));
3446  __kmp_printf("\n");
3447 
3448  // Free team list.
3449  while (list != NULL) {
3450  kmp_team_list_item_t *item = list;
3451  list = list->next;
3452  KMP_INTERNAL_FREE(item);
3453  }
3454 }
3455 
3456 #endif
3457 
3458 //---------------------------------------------------------------------------
3459 // Stuff for per-thread fast random number generator
3460 // Table of primes
3461 static const unsigned __kmp_primes[] = {
3462  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3463  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3464  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3465  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3466  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3467  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3468  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3469  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3470  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3471  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3472  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3473 
3474 //---------------------------------------------------------------------------
3475 // __kmp_get_random: Get a random number using a linear congruential method.
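// Returns the upper 16 bits of the current state, then advances the state as
// x = a * x + 1 (mod 2^32) using the thread's prime multiplier a.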
3476 unsigned short __kmp_get_random(kmp_info_t *thread) {
3477  unsigned x = thread->th.th_x;
3478  unsigned short r = x >> 16;
3479 
3480  thread->th.th_x = x * thread->th.th_a + 1;
3481 
3482  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3483  thread->th.th_info.ds.ds_tid, r));
3484 
3485  return r;
3486 }
3487 //--------------------------------------------------------
3488 // __kmp_init_random: Initialize a random number generator
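// The seed is the thread id: it selects the multiplier from __kmp_primes and,
// together with that multiplier, determines the initial state.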
3489 void __kmp_init_random(kmp_info_t *thread) {
3490  unsigned seed = thread->th.th_info.ds.ds_tid;
3491 
3492  thread->th.th_a =
3493  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3494  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3495  KA_TRACE(30,
3496  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3497 }
3498 
3499 #if KMP_OS_WINDOWS
3500 /* reclaim array entries for root threads that are already dead; returns the
3501  * number reclaimed */
3502 static int __kmp_reclaim_dead_roots(void) {
3503  int i, r = 0;
3504 
3505  for (i = 0; i < __kmp_threads_capacity; ++i) {
3506  if (KMP_UBER_GTID(i) &&
3507  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3508  !__kmp_root[i]
3509  ->r.r_active) { // AC: reclaim only roots died in non-active state
3510  r += __kmp_unregister_root_other_thread(i);
3511  }
3512  }
3513  return r;
3514 }
3515 #endif
3516 
3517 /* This function attempts to create free entries in __kmp_threads and
3518  __kmp_root, and returns the number of free entries generated.
3519 
3520  For Windows* OS static library, the first mechanism used is to reclaim array
3521  entries for root threads that are already dead.
3522 
3523  On all platforms, expansion is attempted on the arrays __kmp_threads and
3524  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3525  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3526  threadprivate cache array has been created. Synchronization with
3527  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3528 
3529  After any dead root reclamation, if the clipping value allows array expansion
3530  to result in the generation of a total of nNeed free slots, the function does
3531  that expansion. If not, nothing is done beyond the possible initial root
3532  thread reclamation.
3533 
3534  If any argument is negative, the behavior is undefined. */
3535 static int __kmp_expand_threads(int nNeed) {
3536  int added = 0;
3537  int minimumRequiredCapacity;
3538  int newCapacity;
3539  kmp_info_t **newThreads;
3540  kmp_root_t **newRoot;
3541 
3542 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3543 // resizing __kmp_threads does not need additional protection if foreign
3544 // threads are present
3545 
3546 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3547  /* only for Windows static library */
3548  /* reclaim array entries for root threads that are already dead */
3549  added = __kmp_reclaim_dead_roots();
3550 
3551  if (nNeed) {
3552  nNeed -= added;
3553  if (nNeed < 0)
3554  nNeed = 0;
3555  }
3556 #endif
3557  if (nNeed <= 0)
3558  return added;
3559 
3560  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3561  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3562  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3563  // > __kmp_max_nth in one of two ways:
3564  //
3565  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3566  // may not be reused by another thread, so we may need to increase
3567  // __kmp_threads_capacity to __kmp_max_nth + 1.
3568  //
3569  // 2) New foreign root(s) are encountered. We always register new foreign
3570  // roots. This may cause a smaller # of threads to be allocated at
3571  // subsequent parallel regions, but the worker threads hang around (and
3572  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3573  //
3574  // Anyway, that is the reason for moving the check to see if
3575  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3576  // instead of having it performed here. -BB
3577 
3578  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3579 
3580  /* compute expansion headroom to check if we can expand */
3581  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3582  /* possible expansion too small -- give up */
3583  return added;
3584  }
3585  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3586 
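  // Grow the capacity by repeated doubling, clipped to __kmp_sys_max_nth,
  // until it covers the minimum required capacity.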
3587  newCapacity = __kmp_threads_capacity;
3588  do {
3589  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3590  : __kmp_sys_max_nth;
3591  } while (newCapacity < minimumRequiredCapacity);
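  // Allocate one block that holds both the new threads array and the new root
  // array (plus cache-line padding); newRoot points at the tail of the block.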
3592  newThreads = (kmp_info_t **)__kmp_allocate(
3593  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3594  newRoot =
3595  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3596  KMP_MEMCPY(newThreads, __kmp_threads,
3597  __kmp_threads_capacity * sizeof(kmp_info_t *));
3598  KMP_MEMCPY(newRoot, __kmp_root,
3599  __kmp_threads_capacity * sizeof(kmp_root_t *));
3600 
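  // Swap in the new arrays, free the old threads block, and publish the
  // enlarged capacity last.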
3601  kmp_info_t **temp_threads = __kmp_threads;
3602  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3603  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3604  __kmp_free(temp_threads);
3605  added += newCapacity - __kmp_threads_capacity;
3606  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3607 
3608  if (newCapacity > __kmp_tp_capacity) {
3609  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3610  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3611  __kmp_threadprivate_resize_cache(newCapacity);
3612  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3613  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3614  }
3615  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3616  }
3617 
3618  return added;
3619 }
3620 
3621 /* Register the current thread as a root thread and obtain our gtid. We must
3622  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3623  thread that calls from __kmp_do_serial_initialize() */
3624 int __kmp_register_root(int initial_thread) {
3625  kmp_info_t *root_thread;
3626  kmp_root_t *root;
3627  int gtid;
3628  int capacity;
3629  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3630  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3631  KMP_MB();
3632 
3633  /* 2007-03-02:
3634  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3635  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3636  condition does not work as expected -- it may return false (meaning there
3637  is at least one empty slot in the __kmp_threads array), but it is possible
3638  the only free slot is #0, which is reserved for the initial thread and so
3639  cannot be used by this one. The following code works around this bug.
3640 
3641  However, the right solution seems to be not reserving slot #0 for the
3642  initial thread, because:
3643  (1) there is no magic in slot #0,
3644  (2) we cannot detect the initial thread reliably (the first thread that
3645  does serial initialization may not be a real initial thread).
3646  */
3647  capacity = __kmp_threads_capacity;
3648  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3649  --capacity;
3650  }
3651 
3652  /* see if there are too many threads */
3653  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3654  if (__kmp_tp_cached) {
3655  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3656  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3657  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3658  } else {
3659  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3660  __kmp_msg_null);
3661  }
3662  }
3663 
3664  /* find an available thread slot */
3665  /* Don't reassign the zero slot since we need that to only be used by initial
3666  thread */
3667  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3668  gtid++)
3669  ;
3670  KA_TRACE(1,
3671  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3672  KMP_ASSERT(gtid < __kmp_threads_capacity);
3673 
3674  /* update global accounting */
3675  __kmp_all_nth++;
3676  TCW_4(__kmp_nth, __kmp_nth + 1);
3677 
3678  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3679  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3680  if (__kmp_adjust_gtid_mode) {
3681  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3682  if (TCR_4(__kmp_gtid_mode) != 2) {
3683  TCW_4(__kmp_gtid_mode, 2);
3684  }
3685  } else {
3686  if (TCR_4(__kmp_gtid_mode) != 1) {
3687  TCW_4(__kmp_gtid_mode, 1);
3688  }
3689  }
3690  }
3691 
3692 #ifdef KMP_ADJUST_BLOCKTIME
3693  /* Adjust blocktime to zero if necessary */
3694  /* Middle initialization might not have occurred yet */
3695  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3696  if (__kmp_nth > __kmp_avail_proc) {
3697  __kmp_zero_bt = TRUE;
3698  }
3699  }
3700 #endif /* KMP_ADJUST_BLOCKTIME */
3701 
3702  /* setup this new hierarchy */
3703  if (!(root = __kmp_root[gtid])) {
3704  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3705  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3706  }
3707 
3708 #if KMP_STATS_ENABLED
3709  // Initialize stats as soon as possible (right after gtid assignment).
3710  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3711  __kmp_stats_thread_ptr->startLife();
3712  KMP_SET_THREAD_STATE(SERIAL_REGION);
3713  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3714 #endif
3715  __kmp_initialize_root(root);
3716 
3717  /* setup new root thread structure */
3718  if (root->r.r_uber_thread) {
3719  root_thread = root->r.r_uber_thread;
3720  } else {
3721  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3722  if (__kmp_storage_map) {
3723  __kmp_print_thread_storage_map(root_thread, gtid);
3724  }
3725  root_thread->th.th_info.ds.ds_gtid = gtid;
3726 #if OMPT_SUPPORT
3727  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3728 #endif
3729  root_thread->th.th_root = root;
3730  if (__kmp_env_consistency_check) {
3731  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3732  }
3733 #if USE_FAST_MEMORY
3734  __kmp_initialize_fast_memory(root_thread);
3735 #endif /* USE_FAST_MEMORY */
3736 
3737 #if KMP_USE_BGET
3738  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3739  __kmp_initialize_bget(root_thread);
3740 #endif
3741  __kmp_init_random(root_thread); // Initialize random number generator
3742  }
3743 
3744  /* setup the serial team held in reserve by the root thread */
3745  if (!root_thread->th.th_serial_team) {
3746  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3747  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3748  root_thread->th.th_serial_team =
3749  __kmp_allocate_team(root, 1, 1,
3750 #if OMPT_SUPPORT
3751  ompt_data_none, // root parallel id
3752 #endif
3753 #if OMP_40_ENABLED
3754  proc_bind_default,
3755 #endif
3756  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3757  }
3758  KMP_ASSERT(root_thread->th.th_serial_team);
3759  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3760  root_thread->th.th_serial_team));
3761 
3762  /* drop root_thread into place */
3763  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3764 
3765  root->r.r_root_team->t.t_threads[0] = root_thread;
3766  root->r.r_hot_team->t.t_threads[0] = root_thread;
3767  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3768  // AC: the team created in reserve, not for execution (it is unused for now).
3769  root_thread->th.th_serial_team->t.t_serialized = 0;
3770  root->r.r_uber_thread = root_thread;
3771 
3772  /* initialize the thread, get it ready to go */
3773  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3774  TCW_4(__kmp_init_gtid, TRUE);
3775 
3776  /* prepare the master thread for get_gtid() */
3777  __kmp_gtid_set_specific(gtid);
3778 
3779 #if USE_ITT_BUILD
3780  __kmp_itt_thread_name(gtid);
3781 #endif /* USE_ITT_BUILD */
3782 
3783 #ifdef KMP_TDATA_GTID
3784  __kmp_gtid = gtid;
3785 #endif
3786  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3787  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3788 
3789  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3790  "plain=%u\n",
3791  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3792  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3793  KMP_INIT_BARRIER_STATE));
3794  { // Initialize barrier data.
3795  int b;
3796  for (b = 0; b < bs_last_barrier; ++b) {
3797  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3798 #if USE_DEBUGGER
3799  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3800 #endif
3801  }
3802  }
3803  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3804  KMP_INIT_BARRIER_STATE);
3805 
3806 #if KMP_AFFINITY_SUPPORTED
3807 #if OMP_40_ENABLED
3808  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3809  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3810  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3811  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3812 #endif
3813  if (TCR_4(__kmp_init_middle)) {
3814  __kmp_affinity_set_init_mask(gtid, TRUE);
3815  }
3816 #endif /* KMP_AFFINITY_SUPPORTED */
3817 #if OMP_50_ENABLED
3818  root_thread->th.th_def_allocator = __kmp_def_allocator;
3819 #endif
3820 
3821  __kmp_root_counter++;
3822 
3823 #if OMPT_SUPPORT
3824  if (!initial_thread && ompt_enabled.enabled) {
3825 
3826  kmp_info_t *root_thread = ompt_get_thread();
3827 
3828  ompt_set_thread_state(root_thread, omp_state_overhead);
3829 
3830  if (ompt_enabled.ompt_callback_thread_begin) {
3831  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3832  ompt_thread_initial, __ompt_get_thread_data_internal());
3833  }
3834  ompt_data_t *task_data;
3835  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3836  if (ompt_enabled.ompt_callback_task_create) {
3837  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3838  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3839  // initial task has nothing to return to
3840  }
3841 
3842  ompt_set_thread_state(root_thread, omp_state_work_serial);
3843  }
3844 #endif
3845 
3846  KMP_MB();
3847  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3848 
3849  return gtid;
3850 }
3851 
3852 #if KMP_NESTED_HOT_TEAMS
3853 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3854  const int max_level) {
3855  int i, n, nth;
3856  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3857  if (!hot_teams || !hot_teams[level].hot_team) {
3858  return 0;
3859  }
3860  KMP_DEBUG_ASSERT(level < max_level);
3861  kmp_team_t *team = hot_teams[level].hot_team;
3862  nth = hot_teams[level].hot_team_nth;
3863  n = nth - 1; // master is not freed
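  // For inner levels, recursively free the deeper hot teams owned by this
  // team's threads before freeing the team itself.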
3864  if (level < max_level - 1) {
3865  for (i = 0; i < nth; ++i) {
3866  kmp_info_t *th = team->t.t_threads[i];
3867  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3868  if (i > 0 && th->th.th_hot_teams) {
3869  __kmp_free(th->th.th_hot_teams);
3870  th->th.th_hot_teams = NULL;
3871  }
3872  }
3873  }
3874  __kmp_free_team(root, team, NULL);
3875  return n;
3876 }
3877 #endif
3878 
3879 // Resets a root thread and clears its root and hot teams.
3880 // Returns the number of __kmp_threads entries directly and indirectly freed.
3881 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3882  kmp_team_t *root_team = root->r.r_root_team;
3883  kmp_team_t *hot_team = root->r.r_hot_team;
3884  int n = hot_team->t.t_nproc;
3885  int i;
3886 
3887  KMP_DEBUG_ASSERT(!root->r.r_active);
3888 
3889  root->r.r_root_team = NULL;
3890  root->r.r_hot_team = NULL;
3891  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3892  // before call to __kmp_free_team().
3893  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3894 #if KMP_NESTED_HOT_TEAMS
3895  if (__kmp_hot_teams_max_level >
3896  0) { // need to free nested hot teams and their threads if any
3897  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3898  kmp_info_t *th = hot_team->t.t_threads[i];
3899  if (__kmp_hot_teams_max_level > 1) {
3900  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3901  }
3902  if (th->th.th_hot_teams) {
3903  __kmp_free(th->th.th_hot_teams);
3904  th->th.th_hot_teams = NULL;
3905  }
3906  }
3907  }
3908 #endif
3909  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3910 
3911  // Before we can reap the thread, we need to make certain that all other
3912  // threads in the teams that had this root as ancestor have stopped trying to
3913  // steal tasks.
3914  if (__kmp_tasking_mode != tskm_immediate_exec) {
3915  __kmp_wait_to_unref_task_teams();
3916  }
3917 
3918 #if KMP_OS_WINDOWS
3919  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3920  KA_TRACE(
3921  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3922  "\n",
3923  (LPVOID) & (root->r.r_uber_thread->th),
3924  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3925  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3926 #endif /* KMP_OS_WINDOWS */
3927 
3928 #if OMPT_SUPPORT
3929  if (ompt_enabled.ompt_callback_thread_end) {
3930  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3931  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3932  }
3933 #endif
3934 
3935  TCW_4(__kmp_nth,
3936  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3937  root->r.r_cg_nthreads--;
3938 
3939  __kmp_reap_thread(root->r.r_uber_thread, 1);
3940 
3941  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3942  // it instead of freeing it.
3943  root->r.r_uber_thread = NULL;
3944  /* mark root as no longer in use */
3945  root->r.r_begin = FALSE;
3946 
3947  return n;
3948 }
3949 
3950 void __kmp_unregister_root_current_thread(int gtid) {
3951  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3952  /* this lock should be ok, since unregister_root_current_thread is never
3953  called during an abort, only during a normal close. furthermore, if you
3954  have the forkjoin lock, you should never try to get the initz lock */
3955  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3956  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3957  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3958  "exiting T#%d\n",
3959  gtid));
3960  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3961  return;
3962  }
3963  kmp_root_t *root = __kmp_root[gtid];
3964 
3965  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3966  KMP_ASSERT(KMP_UBER_GTID(gtid));
3967  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3968  KMP_ASSERT(root->r.r_active == FALSE);
3969 
3970  KMP_MB();
3971 
3972 #if OMP_45_ENABLED
3973  kmp_info_t *thread = __kmp_threads[gtid];
3974  kmp_team_t *team = thread->th.th_team;
3975  kmp_task_team_t *task_team = thread->th.th_task_team;
3976 
3977  // we need to wait for the proxy tasks before finishing the thread
3978  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980  // the runtime is shutting down so we won't report any events
3981  thread->th.ompt_thread_info.state = omp_state_undefined;
3982 #endif
3983  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984  }
3985 #endif
3986 
3987  __kmp_reset_root(gtid, root);
3988 
3989  /* free up this thread slot */
3990  __kmp_gtid_set_specific(KMP_GTID_DNE);
3991 #ifdef KMP_TDATA_GTID
3992  __kmp_gtid = KMP_GTID_DNE;
3993 #endif
3994 
3995  KMP_MB();
3996  KC_TRACE(10,
3997  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3998 
3999  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4000 }
4001 
4002 #if KMP_OS_WINDOWS
4003 /* __kmp_forkjoin_lock must be already held
4004  Unregisters a root thread that is not the current thread. Returns the number
4005  of __kmp_threads entries freed as a result. */
4006 static int __kmp_unregister_root_other_thread(int gtid) {
4007  kmp_root_t *root = __kmp_root[gtid];
4008  int r;
4009 
4010  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4011  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4012  KMP_ASSERT(KMP_UBER_GTID(gtid));
4013  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4014  KMP_ASSERT(root->r.r_active == FALSE);
4015 
4016  r = __kmp_reset_root(gtid, root);
4017  KC_TRACE(10,
4018  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4019  return r;
4020 }
4021 #endif
4022 
4023 #if KMP_DEBUG
4024 void __kmp_task_info() {
4025 
4026  kmp_int32 gtid = __kmp_entry_gtid();
4027  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4028  kmp_info_t *this_thr = __kmp_threads[gtid];
4029  kmp_team_t *steam = this_thr->th.th_serial_team;
4030  kmp_team_t *team = this_thr->th.th_team;
4031 
4032  __kmp_printf(
4033  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4034  "ptask=%p\n",
4035  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4036  team->t.t_implicit_task_taskdata[tid].td_parent);
4037 }
4038 #endif // KMP_DEBUG
4039 
4040 /* TODO optimize with one big memclr, take out what isn't needed, split
4041  responsibility to workers as much as possible, and delay initialization of
4042  features as much as possible */
4043 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4044  int tid, int gtid) {
4045  /* this_thr->th.th_info.ds.ds_gtid is setup in
4046  kmp_allocate_thread/create_worker.
4047  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4048  kmp_info_t *master = team->t.t_threads[0];
4049  KMP_DEBUG_ASSERT(this_thr != NULL);
4050  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4051  KMP_DEBUG_ASSERT(team);
4052  KMP_DEBUG_ASSERT(team->t.t_threads);
4053  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4054  KMP_DEBUG_ASSERT(master);
4055  KMP_DEBUG_ASSERT(master->th.th_root);
4056 
4057  KMP_MB();
4058 
4059  TCW_SYNC_PTR(this_thr->th.th_team, team);
4060 
4061  this_thr->th.th_info.ds.ds_tid = tid;
4062  this_thr->th.th_set_nproc = 0;
4063  if (__kmp_tasking_mode != tskm_immediate_exec)
4064  // When tasking is possible, threads are not safe to reap until they are
4065  // done tasking; this will be set when tasking code is exited in wait
4066  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4067  else // no tasking --> always safe to reap
4068  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4069 #if OMP_40_ENABLED
4070  this_thr->th.th_set_proc_bind = proc_bind_default;
4071 #if KMP_AFFINITY_SUPPORTED
4072  this_thr->th.th_new_place = this_thr->th.th_current_place;
4073 #endif
4074 #endif
4075  this_thr->th.th_root = master->th.th_root;
4076 
4077  /* setup the thread's cache of the team structure */
4078  this_thr->th.th_team_nproc = team->t.t_nproc;
4079  this_thr->th.th_team_master = master;
4080  this_thr->th.th_team_serialized = team->t.t_serialized;
4081  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4082 
4083  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4084 
4085  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4086  tid, gtid, this_thr, this_thr->th.th_current_task));
4087 
4088  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4089  team, tid, TRUE);
4090 
4091  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4092  tid, gtid, this_thr, this_thr->th.th_current_task));
4093  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4094  // __kmp_initialize_team()?
4095 
4096  /* TODO no worksharing in speculative threads */
4097  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4098 
4099  this_thr->th.th_local.this_construct = 0;
4100 
4101  if (!this_thr->th.th_pri_common) {
4102  this_thr->th.th_pri_common =
4103  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4104  if (__kmp_storage_map) {
4105  __kmp_print_storage_map_gtid(
4106  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4107  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4108  }
4109  this_thr->th.th_pri_head = NULL;
4110  }
4111 
4112  /* Initialize dynamic dispatch */
4113  {
4114  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4115  // Use team max_nproc since this will never change for the team.
4116  size_t disp_size =
4117  sizeof(dispatch_private_info_t) *
4118  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4119  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4120  team->t.t_max_nproc));
4121  KMP_ASSERT(dispatch);
4122  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4123  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4124 
4125  dispatch->th_disp_index = 0;
4126 #if OMP_45_ENABLED
4127  dispatch->th_doacross_buf_idx = 0;
4128 #endif
4129  if (!dispatch->th_disp_buffer) {
4130  dispatch->th_disp_buffer =
4131  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4132 
4133  if (__kmp_storage_map) {
4134  __kmp_print_storage_map_gtid(
4135  gtid, &dispatch->th_disp_buffer[0],
4136  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4137  ? 1
4138  : __kmp_dispatch_num_buffers],
4139  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4140  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4141  gtid, team->t.t_id, gtid);
4142  }
4143  } else {
4144  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4145  }
4146 
4147  dispatch->th_dispatch_pr_current = 0;
4148  dispatch->th_dispatch_sh_current = 0;
4149 
4150  dispatch->th_deo_fcn = 0; /* ORDERED */
4151  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4152  }
4153 
4154  this_thr->th.th_next_pool = NULL;
4155 
4156  if (!this_thr->th.th_task_state_memo_stack) {
4157  size_t i;
4158  this_thr->th.th_task_state_memo_stack =
4159  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4160  this_thr->th.th_task_state_top = 0;
4161  this_thr->th.th_task_state_stack_sz = 4;
4162  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4163  ++i) // zero init the stack
4164  this_thr->th.th_task_state_memo_stack[i] = 0;
4165  }
4166 
4167  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4168  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4169 
4170  KMP_MB();
4171 }
4172 
4173 /* allocate a new thread for the requesting team. this is only called from
4174  within a forkjoin critical section. we will first try to get an available
4175  thread from the thread pool. if none is available, we will fork a new one,
4176  assuming we are able to create one. this should be assured, as the caller
4177  should have checked this first. */
4178 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4179  int new_tid) {
4180  kmp_team_t *serial_team;
4181  kmp_info_t *new_thr;
4182  int new_gtid;
4183 
4184  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4185  KMP_DEBUG_ASSERT(root && team);
4186 #if !KMP_NESTED_HOT_TEAMS
4187  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4188 #endif
4189  KMP_MB();
4190 
4191  /* first, try to get one from the thread pool */
4192  if (__kmp_thread_pool) {
4193 
4194  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4195  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4196  if (new_thr == __kmp_thread_pool_insert_pt) {
4197  __kmp_thread_pool_insert_pt = NULL;
4198  }
4199  TCW_4(new_thr->th.th_in_pool, FALSE);
4200  // Don't touch th_active_in_pool or th_active.
4201  // The worker thread adjusts those flags as it sleeps/awakens.
4202  __kmp_thread_pool_nth--;
4203 
4204  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4205  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4206  KMP_ASSERT(!new_thr->th.th_team);
4207  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4208  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4209 
4210  /* setup the thread structure */
4211  __kmp_initialize_info(new_thr, team, new_tid,
4212  new_thr->th.th_info.ds.ds_gtid);
4213  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4214 
4215  TCW_4(__kmp_nth, __kmp_nth + 1);
4216  root->r.r_cg_nthreads++;
4217 
4218  new_thr->th.th_task_state = 0;
4219  new_thr->th.th_task_state_top = 0;
4220  new_thr->th.th_task_state_stack_sz = 4;
4221 
4222 #ifdef KMP_ADJUST_BLOCKTIME
4223  /* Adjust blocktime back to zero if necessary */
4224  /* Middle initialization might not have occurred yet */
4225  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4226  if (__kmp_nth > __kmp_avail_proc) {
4227  __kmp_zero_bt = TRUE;
4228  }
4229  }
4230 #endif /* KMP_ADJUST_BLOCKTIME */
4231 
4232 #if KMP_DEBUG
4233  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4234  // not equal KMP_BARRIER_PARENT_FLAG.
4235  int b;
4236  kmp_balign_t *balign = new_thr->th.th_bar;
4237  for (b = 0; b < bs_last_barrier; ++b)
4238  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4239 #endif
4240 
4241  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4242  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4243 
4244  KMP_MB();
4245  return new_thr;
4246  }
4247 
4248  /* no, we'll fork a new one */
4249  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4250  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4251 
4252 #if KMP_USE_MONITOR
4253  // If this is the first worker thread the RTL is creating, then also
4254  // launch the monitor thread. We try to do this as early as possible.
4255  if (!TCR_4(__kmp_init_monitor)) {
4256  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4257  if (!TCR_4(__kmp_init_monitor)) {
4258  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4259  TCW_4(__kmp_init_monitor, 1);
4260  __kmp_create_monitor(&__kmp_monitor);
4261  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4262 #if KMP_OS_WINDOWS
4263  // AC: wait until monitor has started. This is a fix for CQ232808.
4264  // The reason is that if the library is loaded/unloaded in a loop with
4265  // small (parallel) work in between, then there is a high probability that
4266  // the monitor thread starts after the library shutdown. At shutdown it is
4267  // too late to cope with the problem, because when the master is in
4268  // DllMain (process detach) the monitor has no chance to start (it is
4269  // blocked), and the master has no means to inform the monitor that the
4270  // library has gone, because all the memory the monitor can access
4271  // is going to be released/reset.
4272  while (TCR_4(__kmp_init_monitor) < 2) {
4273  KMP_YIELD(TRUE);
4274  }
4275  KF_TRACE(10, ("after monitor thread has started\n"));
4276 #endif
4277  }
4278  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4279  }
4280 #endif
4281 
4282  KMP_MB();
4283  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4284  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4285  }
4286 
4287  /* allocate space for it. */
4288  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4289 
4290  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4291 
4292  if (__kmp_storage_map) {
4293  __kmp_print_thread_storage_map(new_thr, new_gtid);
4294  }
4295 
4296  // add the reserve serialized team, initialized from the team's master thread
4297  {
4298  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4299  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4300  new_thr->th.th_serial_team = serial_team =
4301  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4302 #if OMPT_SUPPORT
4303  ompt_data_none, // root parallel id
4304 #endif
4305 #if OMP_40_ENABLED
4306  proc_bind_default,
4307 #endif
4308  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4309  }
4310  KMP_ASSERT(serial_team);
4311  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4312  // execution (it is unused for now).
4313  serial_team->t.t_threads[0] = new_thr;
4314  KF_TRACE(10,
4315  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4316  new_thr));
4317 
4318  /* setup the thread structures */
4319  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4320 
4321 #if USE_FAST_MEMORY
4322  __kmp_initialize_fast_memory(new_thr);
4323 #endif /* USE_FAST_MEMORY */
4324 
4325 #if KMP_USE_BGET
4326  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4327  __kmp_initialize_bget(new_thr);
4328 #endif
4329 
4330  __kmp_init_random(new_thr); // Initialize random number generator
4331 
4332  /* Initialize these only once when thread is grabbed for a team allocation */
4333  KA_TRACE(20,
4334  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4335  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4336 
4337  int b;
4338  kmp_balign_t *balign = new_thr->th.th_bar;
4339  for (b = 0; b < bs_last_barrier; ++b) {
4340  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4341  balign[b].bb.team = NULL;
4342  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4343  balign[b].bb.use_oncore_barrier = 0;
4344  }
4345 
4346  new_thr->th.th_spin_here = FALSE;
4347  new_thr->th.th_next_waiting = 0;
4348 #if KMP_OS_UNIX
4349  new_thr->th.th_blocking = false;
4350 #endif
4351 
4352 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4353  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4354  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4355  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4356  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4357 #endif
4358 #if OMP_50_ENABLED
4359  new_thr->th.th_def_allocator = __kmp_def_allocator;
4360 #endif
4361 
4362  TCW_4(new_thr->th.th_in_pool, FALSE);
4363  new_thr->th.th_active_in_pool = FALSE;
4364  TCW_4(new_thr->th.th_active, TRUE);
4365 
4366  /* adjust the global counters */
4367  __kmp_all_nth++;
4368  __kmp_nth++;
4369 
4370  root->r.r_cg_nthreads++;
4371 
4372  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4373  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4374  if (__kmp_adjust_gtid_mode) {
4375  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4376  if (TCR_4(__kmp_gtid_mode) != 2) {
4377  TCW_4(__kmp_gtid_mode, 2);
4378  }
4379  } else {
4380  if (TCR_4(__kmp_gtid_mode) != 1) {
4381  TCW_4(__kmp_gtid_mode, 1);
4382  }
4383  }
4384  }
4385 
4386 #ifdef KMP_ADJUST_BLOCKTIME
4387  /* Adjust blocktime back to zero if necessary */
4388  /* Middle initialization might not have occurred yet */
4389  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4390  if (__kmp_nth > __kmp_avail_proc) {
4391  __kmp_zero_bt = TRUE;
4392  }
4393  }
4394 #endif /* KMP_ADJUST_BLOCKTIME */
4395 
4396  /* actually fork it and create the new worker thread */
4397  KF_TRACE(
4398  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4399  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4400  KF_TRACE(10,
4401  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4402 
4403  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4404  new_gtid));
4405  KMP_MB();
4406  return new_thr;
4407 }
4408 
4409 /* Reinitialize team for reuse.
4410  The hot team code calls this routine at every fork barrier, so EPCC barrier
4411  tests are extremely sensitive to changes in it, especially writes to the
4412  team struct, which cause a cache invalidation in all threads.
4413  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4414 static void __kmp_reinitialize_team(kmp_team_t *team,
4415  kmp_internal_control_t *new_icvs,
4416  ident_t *loc) {
4417  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4418  team->t.t_threads[0], team));
4419  KMP_DEBUG_ASSERT(team && new_icvs);
4420  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4421  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4422 
4423  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4424  // Copy ICVs to the master thread's implicit taskdata
4425  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4426  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4427 
4428  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4429  team->t.t_threads[0], team));
4430 }
4431 
4432 /* Initialize the team data structure.
4433  This assumes the t_threads and t_max_nproc are already set.
4434  Also, we don't touch the arguments */
4435 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4436  kmp_internal_control_t *new_icvs,
4437  ident_t *loc) {
4438  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4439 
4440  /* verify */
4441  KMP_DEBUG_ASSERT(team);
4442  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4443  KMP_DEBUG_ASSERT(team->t.t_threads);
4444  KMP_MB();
4445 
4446  team->t.t_master_tid = 0; /* not needed */
4447  /* team->t.t_master_bar; not needed */
4448  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4449  team->t.t_nproc = new_nproc;
4450 
4451  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4452  team->t.t_next_pool = NULL;
4453  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4454  * up hot team */
4455 
4456  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4457  team->t.t_invoke = NULL; /* not needed */
4458 
4459  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4460  team->t.t_sched.sched = new_icvs->sched.sched;
4461 
4462 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4463  team->t.t_fp_control_saved = FALSE; /* not needed */
4464  team->t.t_x87_fpu_control_word = 0; /* not needed */
4465  team->t.t_mxcsr = 0; /* not needed */
4466 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4467 
4468  team->t.t_construct = 0;
4469 
4470  team->t.t_ordered.dt.t_value = 0;
4471  team->t.t_master_active = FALSE;
4472 
4473  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4474 
4475 #ifdef KMP_DEBUG
4476  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4477 #endif
4478 #if KMP_OS_WINDOWS
4479  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4480 #endif
4481 
4482  team->t.t_control_stack_top = NULL;
4483 
4484  __kmp_reinitialize_team(team, new_icvs, loc);
4485 
4486  KMP_MB();
4487  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4488 }
4489 
4490 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4491 /* Sets full mask for thread and returns old mask, no changes to structures. */
4492 static void
4493 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4494  if (KMP_AFFINITY_CAPABLE()) {
4495  int status;
4496  if (old_mask != NULL) {
4497  status = __kmp_get_system_affinity(old_mask, TRUE);
4498  int error = errno;
4499  if (status != 0) {
4500  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4501  __kmp_msg_null);
4502  }
4503  }
4504  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4505  }
4506 }
4507 #endif
4508 
4509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4510 
4511 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4512 // It calculates the worker + master thread's partition based upon the parent
4513 // thread's partition, and binds each worker to a thread in their partition.
4514 // The master thread's partition should already include its current binding.
4515 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4516  // Copy the master thread's place partition to the team struct
4517  kmp_info_t *master_th = team->t.t_threads[0];
4518  KMP_DEBUG_ASSERT(master_th != NULL);
4519  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4520  int first_place = master_th->th.th_first_place;
4521  int last_place = master_th->th.th_last_place;
4522  int masters_place = master_th->th.th_current_place;
4523  team->t.t_first_place = first_place;
4524  team->t.t_last_place = last_place;
4525 
4526  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4527  "bound to place %d partition = [%d,%d]\n",
4528  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4529  team->t.t_id, masters_place, first_place, last_place));
4530 
4531  switch (proc_bind) {
4532 
4533  case proc_bind_default:
4534  // serial teams might have the proc_bind policy set to proc_bind_default. It
4535  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4536  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4537  break;
4538 
4539  case proc_bind_master: {
4540  int f;
4541  int n_th = team->t.t_nproc;
4542  for (f = 1; f < n_th; f++) {
4543  kmp_info_t *th = team->t.t_threads[f];
4544  KMP_DEBUG_ASSERT(th != NULL);
4545  th->th.th_first_place = first_place;
4546  th->th.th_last_place = last_place;
4547  th->th.th_new_place = masters_place;
4548 
4549  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4550  "partition = [%d,%d]\n",
4551  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4552  f, masters_place, first_place, last_place));
4553  }
4554  } break;
4555 
4556  case proc_bind_close: {
4557  int f;
4558  int n_th = team->t.t_nproc;
4559  int n_places;
4560  if (first_place <= last_place) {
4561  n_places = last_place - first_place + 1;
4562  } else {
4563  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4564  }
4565  if (n_th <= n_places) {
4566  int place = masters_place;
4567  for (f = 1; f < n_th; f++) {
4568  kmp_info_t *th = team->t.t_threads[f];
4569  KMP_DEBUG_ASSERT(th != NULL);
4570 
4571  if (place == last_place) {
4572  place = first_place;
4573  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4574  place = 0;
4575  } else {
4576  place++;
4577  }
4578  th->th.th_first_place = first_place;
4579  th->th.th_last_place = last_place;
4580  th->th.th_new_place = place;
4581 
4582  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4583  "partition = [%d,%d]\n",
4584  __kmp_gtid_from_thread(team->t.t_threads[f]),
4585  team->t.t_id, f, place, first_place, last_place));
4586  }
4587  } else {
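  // More threads than places: give each place S = n_th / n_places threads and
  // spread the rem leftovers by adding one extra thread roughly every gap places.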
4588  int S, rem, gap, s_count;
4589  S = n_th / n_places;
4590  s_count = 0;
4591  rem = n_th - (S * n_places);
4592  gap = rem > 0 ? n_places / rem : n_places;
4593  int place = masters_place;
4594  int gap_ct = gap;
4595  for (f = 0; f < n_th; f++) {
4596  kmp_info_t *th = team->t.t_threads[f];
4597  KMP_DEBUG_ASSERT(th != NULL);
4598 
4599  th->th.th_first_place = first_place;
4600  th->th.th_last_place = last_place;
4601  th->th.th_new_place = place;
4602  s_count++;
4603 
4604  if ((s_count == S) && rem && (gap_ct == gap)) {
4605  // do nothing; keep this place so an extra thread is added to it on the next iteration
4606  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4607  // we added an extra thread to this place; move to next place
4608  if (place == last_place) {
4609  place = first_place;
4610  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4611  place = 0;
4612  } else {
4613  place++;
4614  }
4615  s_count = 0;
4616  gap_ct = 1;
4617  rem--;
4618  } else if (s_count == S) { // place full; don't add extra
4619  if (place == last_place) {
4620  place = first_place;
4621  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4622  place = 0;
4623  } else {
4624  place++;
4625  }
4626  gap_ct++;
4627  s_count = 0;
4628  }
4629 
4630  KA_TRACE(100,
4631  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4632  "partition = [%d,%d]\n",
4633  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4634  th->th.th_new_place, first_place, last_place));
4635  }
4636  KMP_DEBUG_ASSERT(place == masters_place);
4637  }
4638  } break;
4639 
4640  case proc_bind_spread: {
4641  int f;
4642  int n_th = team->t.t_nproc;
4643  int n_places;
4644  int thidx;
4645  if (first_place <= last_place) {
4646  n_places = last_place - first_place + 1;
4647  } else {
4648  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4649  }
4650  if (n_th <= n_places) {
4651  int place = -1;
4652 
4653  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4654  int S = n_places / n_th;
4655  int s_count, rem, gap, gap_ct;
4656 
4657  place = masters_place;
4658  rem = n_places - n_th * S;
4659  gap = rem ? n_th / rem : 1;
4660  gap_ct = gap;
4661  thidx = n_th;
4662  if (update_master_only == 1)
4663  thidx = 1;
4664  for (f = 0; f < thidx; f++) {
4665  kmp_info_t *th = team->t.t_threads[f];
4666  KMP_DEBUG_ASSERT(th != NULL);
4667 
4668  th->th.th_first_place = place;
4669  th->th.th_new_place = place;
4670  s_count = 1;
4671  while (s_count < S) {
4672  if (place == last_place) {
4673  place = first_place;
4674  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675  place = 0;
4676  } else {
4677  place++;
4678  }
4679  s_count++;
4680  }
4681  if (rem && (gap_ct == gap)) {
4682  if (place == last_place) {
4683  place = first_place;
4684  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4685  place = 0;
4686  } else {
4687  place++;
4688  }
4689  rem--;
4690  gap_ct = 0;
4691  }
4692  th->th.th_last_place = place;
4693  gap_ct++;
4694 
4695  if (place == last_place) {
4696  place = first_place;
4697  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4698  place = 0;
4699  } else {
4700  place++;
4701  }
4702 
4703  KA_TRACE(100,
4704  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4705  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4706  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4707  f, th->th.th_new_place, th->th.th_first_place,
4708  th->th.th_last_place, __kmp_affinity_num_masks));
4709  }
4710  } else {
4711  /* Having a uniform space of available computation places, we can create
4712  T partitions of round(P/T) size and put threads into the first
4713  place of each partition. */

4714  double current = static_cast<double>(masters_place);
4715  double spacing =
4716  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
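// Each thread is given a window of about `spacing` consecutive places starting
// at `current`, which advances by `spacing` per thread. For example, 8 places
// and 3 threads with the master on place 0 give spacing = 3.0 and partitions
// [0,2], [3,5], [6,7] (the last window clamped to n_places - 1).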
4717  int first, last;
4718  kmp_info_t *th;
4719 
4720  thidx = n_th + 1;
4721  if (update_master_only == 1)
4722  thidx = 1;
4723  for (f = 0; f < thidx; f++) {
4724  first = static_cast<int>(current);
4725  last = static_cast<int>(current + spacing) - 1;
4726  KMP_DEBUG_ASSERT(last >= first);
4727  if (first >= n_places) {
4728  if (masters_place) {
4729  first -= n_places;
4730  last -= n_places;
4731  if (first == (masters_place + 1)) {
4732  KMP_DEBUG_ASSERT(f == n_th);
4733  first--;
4734  }
4735  if (last == masters_place) {
4736  KMP_DEBUG_ASSERT(f == (n_th - 1));
4737  last--;
4738  }
4739  } else {
4740  KMP_DEBUG_ASSERT(f == n_th);
4741  first = 0;
4742  last = 0;
4743  }
4744  }
4745  if (last >= n_places) {
4746  last = (n_places - 1);
4747  }
4748  place = first;
4749  current += spacing;
4750  if (f < n_th) {
4751  KMP_DEBUG_ASSERT(0 <= first);
4752  KMP_DEBUG_ASSERT(n_places > first);
4753  KMP_DEBUG_ASSERT(0 <= last);
4754  KMP_DEBUG_ASSERT(n_places > last);
4755  KMP_DEBUG_ASSERT(last_place >= first_place);
4756  th = team->t.t_threads[f];
4757  KMP_DEBUG_ASSERT(th);
4758  th->th.th_first_place = first;
4759  th->th.th_new_place = place;
4760  th->th.th_last_place = last;
4761 
4762  KA_TRACE(100,
4763  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4764  "partition = [%d,%d], spacing = %.4f\n",
4765  __kmp_gtid_from_thread(team->t.t_threads[f]),
4766  team->t.t_id, f, th->th.th_new_place,
4767  th->th.th_first_place, th->th.th_last_place, spacing));
4768  }
4769  }
4770  }
4771  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4772  } else {
4773  int S, rem, gap, s_count;
4774  S = n_th / n_places;
4775  s_count = 0;
4776  rem = n_th - (S * n_places);
4777  gap = rem > 0 ? n_places / rem : n_places;
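// More threads than places: same S/rem/gap distribution as in the
// proc_bind_close overflow case above, except that each thread is pinned to a
// single place (its first, last, and new place coincide).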
4778  int place = masters_place;
4779  int gap_ct = gap;
4780  thidx = n_th;
4781  if (update_master_only == 1)
4782  thidx = 1;
4783  for (f = 0; f < thidx; f++) {
4784  kmp_info_t *th = team->t.t_threads[f];
4785  KMP_DEBUG_ASSERT(th != NULL);
4786 
4787  th->th.th_first_place = place;
4788  th->th.th_last_place = place;
4789  th->th.th_new_place = place;
4790  s_count++;
4791 
4792  if ((s_count == S) && rem && (gap_ct == gap)) {
4793  // do nothing, add an extra thread to place on next iteration
4794  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4795  // we added an extra thread to this place; move on to next place
4796  if (place == last_place) {
4797  place = first_place;
4798  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4799  place = 0;
4800  } else {
4801  place++;
4802  }
4803  s_count = 0;
4804  gap_ct = 1;
4805  rem--;
4806  } else if (s_count == S) { // place is full; don't add extra thread
4807  if (place == last_place) {
4808  place = first_place;
4809  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4810  place = 0;
4811  } else {
4812  place++;
4813  }
4814  gap_ct++;
4815  s_count = 0;
4816  }
4817 
4818  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4819  "partition = [%d,%d]\n",
4820  __kmp_gtid_from_thread(team->t.t_threads[f]),
4821  team->t.t_id, f, th->th.th_new_place,
4822  th->th.th_first_place, th->th.th_last_place));
4823  }
4824  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4825  }
4826  } break;
4827 
4828  default:
4829  break;
4830  }
4831 
4832  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4833 }
4834 
4835 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4836 
4837 /* allocate a new team data structure to use. take one off of the free pool if
4838  available */
4839 kmp_team_t *
4840 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4841 #if OMPT_SUPPORT
4842  ompt_data_t ompt_parallel_data,
4843 #endif
4844 #if OMP_40_ENABLED
4845  kmp_proc_bind_t new_proc_bind,
4846 #endif
4847  kmp_internal_control_t *new_icvs,
4848  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4849  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4850  int f;
4851  kmp_team_t *team;
4852  int use_hot_team = !root->r.r_active;
4853  int level = 0;
4854 
4855  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4856  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4857  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4858  KMP_MB();
4859 
4860 #if KMP_NESTED_HOT_TEAMS
4861  kmp_hot_team_ptr_t *hot_teams;
4862  if (master) {
4863  team = master->th.th_team;
4864  level = team->t.t_active_level;
4865  if (master->th.th_teams_microtask) { // in teams construct?
4866  if (master->th.th_teams_size.nteams > 1 &&
4867  ( // #teams > 1
4868  team->t.t_pkfn ==
4869  (microtask_t)__kmp_teams_master || // inner fork of the teams
4870  master->th.th_teams_level <
4871  team->t.t_level)) { // or nested parallel inside the teams
4872  ++level; // do not increment if #teams==1 or for the outer fork of the
4873  // teams; increment otherwise
4874  }
4875  }
4876  hot_teams = master->th.th_hot_teams;
4877  if (level < __kmp_hot_teams_max_level && hot_teams &&
4878  hot_teams[level]
4879  .hot_team) { // hot team has already been allocated for given level
4880  use_hot_team = 1;
4881  } else {
4882  use_hot_team = 0;
4883  }
4884  }
4885 #endif
4886  // Optimization to use a "hot" team
4887  if (use_hot_team && new_nproc > 1) {
4888  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4889 #if KMP_NESTED_HOT_TEAMS
4890  team = hot_teams[level].hot_team;
4891 #else
4892  team = root->r.r_hot_team;
4893 #endif
4894 #if KMP_DEBUG
4895  if (__kmp_tasking_mode != tskm_immediate_exec) {
4896  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4897  "task_team[1] = %p before reinit\n",
4898  team->t.t_task_team[0], team->t.t_task_team[1]));
4899  }
4900 #endif
4901 
4902  // Has the number of threads changed?
4903  /* Let's assume the most common case is that the number of threads is
4904  unchanged, and put that case first. */
4905  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4906  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4907  // This case can mean that omp_set_num_threads() was called and the hot
4908  // team size was already reduced, so we check the special flag
4909  if (team->t.t_size_changed == -1) {
4910  team->t.t_size_changed = 1;
4911  } else {
4912  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4913  }
4914 
4915  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4916  kmp_r_sched_t new_sched = new_icvs->sched;
4917  // set master's schedule as new run-time schedule
4918  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4919 
4920  __kmp_reinitialize_team(team, new_icvs,
4921  root->r.r_uber_thread->th.th_ident);
4922 
4923  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4924  team->t.t_threads[0], team));
4925  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4926 
4927 #if OMP_40_ENABLED
4928 #if KMP_AFFINITY_SUPPORTED
4929  if ((team->t.t_size_changed == 0) &&
4930  (team->t.t_proc_bind == new_proc_bind)) {
4931  if (new_proc_bind == proc_bind_spread) {
4932  __kmp_partition_places(
4933  team, 1); // add flag to update only master for spread
4934  }
4935  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4936  "proc_bind = %d, partition = [%d,%d]\n",
4937  team->t.t_id, new_proc_bind, team->t.t_first_place,
4938  team->t.t_last_place));
4939  } else {
4940  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4941  __kmp_partition_places(team);
4942  }
4943 #else
4944  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4945 #endif /* KMP_AFFINITY_SUPPORTED */
4946 #endif /* OMP_40_ENABLED */
4947  } else if (team->t.t_nproc > new_nproc) {
4948  KA_TRACE(20,
4949  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4950  new_nproc));
4951 
4952  team->t.t_size_changed = 1;
4953 #if KMP_NESTED_HOT_TEAMS
4954  if (__kmp_hot_teams_mode == 0) {
4955  // AC: saved number of threads should correspond to team's value in this
4956  // mode, can be bigger in mode 1, when hot team has threads in reserve
4957  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4958  hot_teams[level].hot_team_nth = new_nproc;
4959 #endif // KMP_NESTED_HOT_TEAMS
4960  /* release the extra threads we don't need any more */
4961  for (f = new_nproc; f < team->t.t_nproc; f++) {
4962  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4963  if (__kmp_tasking_mode != tskm_immediate_exec) {
4964  // When decreasing team size, threads no longer in the team should
4965  // unref task team.
4966  team->t.t_threads[f]->th.th_task_team = NULL;
4967  }
4968  __kmp_free_thread(team->t.t_threads[f]);
4969  team->t.t_threads[f] = NULL;
4970  }
4971 #if KMP_NESTED_HOT_TEAMS
4972  } // (__kmp_hot_teams_mode == 0)
4973  else {
4974  // When keeping extra threads in team, switch threads to wait on own
4975  // b_go flag
4976  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4977  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4978  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4979  for (int b = 0; b < bs_last_barrier; ++b) {
4980  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4981  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4982  }
4983  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4984  }
4985  }
4986  }
4987 #endif // KMP_NESTED_HOT_TEAMS
4988  team->t.t_nproc = new_nproc;
4989  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4990  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4991  __kmp_reinitialize_team(team, new_icvs,
4992  root->r.r_uber_thread->th.th_ident);
4993 
4994  /* update the remaining threads */
4995  for (f = 0; f < new_nproc; ++f) {
4996  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4997  }
4998  // restore the current task state of the master thread: should be the
4999  // implicit task
5000  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5001  team->t.t_threads[0], team));
5002 
5003  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5004 
5005 #ifdef KMP_DEBUG
5006  for (f = 0; f < team->t.t_nproc; f++) {
5007  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5008  team->t.t_threads[f]->th.th_team_nproc ==
5009  team->t.t_nproc);
5010  }
5011 #endif
5012 
5013 #if OMP_40_ENABLED
5014  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5015 #if KMP_AFFINITY_SUPPORTED
5016  __kmp_partition_places(team);
5017 #endif
5018 #endif
5019  } else { // team->t.t_nproc < new_nproc
5020 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5021  kmp_affin_mask_t *old_mask;
5022  if (KMP_AFFINITY_CAPABLE()) {
5023  KMP_CPU_ALLOC(old_mask);
5024  }
5025 #endif
5026 
5027  KA_TRACE(20,
5028  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5029  new_nproc));
5030 
5031  team->t.t_size_changed = 1;
5032 
5033 #if KMP_NESTED_HOT_TEAMS
5034  int avail_threads = hot_teams[level].hot_team_nth;
5035  if (new_nproc < avail_threads)
5036  avail_threads = new_nproc;
5037  kmp_info_t **other_threads = team->t.t_threads;
5038  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5039  // Adjust barrier data of reserved threads (if any) of the team
5040  // Other data will be set in __kmp_initialize_info() below.
5041  int b;
5042  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5043  for (b = 0; b < bs_last_barrier; ++b) {
5044  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5045  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5046 #if USE_DEBUGGER
5047  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5048 #endif
5049  }
5050  }
5051  if (hot_teams[level].hot_team_nth >= new_nproc) {
5052  // we have all needed threads in reserve, no need to allocate any
5053  // this is only possible in mode 1; mode 0 cannot have reserved threads
5054  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5055  team->t.t_nproc = new_nproc; // just get reserved threads involved
5056  } else {
5057  // we may have some threads in reserve, but not enough
5058  team->t.t_nproc =
5059  hot_teams[level]
5060  .hot_team_nth; // get reserved threads involved if any
5061  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5062 #endif // KMP_NESTED_HOT_TEAMS
5063  if (team->t.t_max_nproc < new_nproc) {
5064  /* reallocate larger arrays */
5065  __kmp_reallocate_team_arrays(team, new_nproc);
5066  __kmp_reinitialize_team(team, new_icvs, NULL);
5067  }
5068 
5069 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5070  /* Temporarily set full mask for master thread before creation of
5071  workers. The reason is that workers inherit the affinity from master,
5072  so if a lot of workers are created on the single core quickly, they
5073  don't get a chance to set their own affinity for a long time. */
5074  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5075 #endif
5076 
5077  /* allocate new threads for the hot team */
5078  for (f = team->t.t_nproc; f < new_nproc; f++) {
5079  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5080  KMP_DEBUG_ASSERT(new_worker);
5081  team->t.t_threads[f] = new_worker;
5082 
5083  KA_TRACE(20,
5084  ("__kmp_allocate_team: team %d init T#%d arrived: "
5085  "join=%llu, plain=%llu\n",
5086  team->t.t_id, __kmp_gtid_from_tid(f, team),
5087  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5088  team->t.t_bar[bs_plain_barrier].b_arrived));
5089 
5090  { // Initialize barrier data for new threads.
5091  int b;
5092  kmp_balign_t *balign = new_worker->th.th_bar;
5093  for (b = 0; b < bs_last_barrier; ++b) {
5094  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5095  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5096  KMP_BARRIER_PARENT_FLAG);
5097 #if USE_DEBUGGER
5098  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5099 #endif
5100  }
5101  }
5102  }
5103 
5104 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5105  if (KMP_AFFINITY_CAPABLE()) {
5106  /* Restore initial master thread's affinity mask */
5107  __kmp_set_system_affinity(old_mask, TRUE);
5108  KMP_CPU_FREE(old_mask);
5109  }
5110 #endif
5111 #if KMP_NESTED_HOT_TEAMS
5112  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5113 #endif // KMP_NESTED_HOT_TEAMS
5114  /* make sure everyone is synchronized */
5115  int old_nproc = team->t.t_nproc; // save old value and use to update only
5116  // new threads below
5117  __kmp_initialize_team(team, new_nproc, new_icvs,
5118  root->r.r_uber_thread->th.th_ident);
5119 
5120  /* reinitialize the threads */
5121  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5122  for (f = 0; f < team->t.t_nproc; ++f)
5123  __kmp_initialize_info(team->t.t_threads[f], team, f,
5124  __kmp_gtid_from_tid(f, team));
5125  if (level) { // set th_task_state for new threads in nested hot team
5126  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5127  // only need to set the th_task_state for the new threads. th_task_state
5128  // for master thread will not be accurate until after this in
5129  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5130  // correct value.
5131  for (f = old_nproc; f < team->t.t_nproc; ++f)
5132  team->t.t_threads[f]->th.th_task_state =
5133  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5134  } else { // set th_task_state for new threads in non-nested hot team
5135  int old_state =
5136  team->t.t_threads[0]->th.th_task_state; // copy master's state
5137  for (f = old_nproc; f < team->t.t_nproc; ++f)
5138  team->t.t_threads[f]->th.th_task_state = old_state;
5139  }
5140 
5141 #ifdef KMP_DEBUG
5142  for (f = 0; f < team->t.t_nproc; ++f) {
5143  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5144  team->t.t_threads[f]->th.th_team_nproc ==
5145  team->t.t_nproc);
5146  }
5147 #endif
5148 
5149 #if OMP_40_ENABLED
5150  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5151 #if KMP_AFFINITY_SUPPORTED
5152  __kmp_partition_places(team);
5153 #endif
5154 #endif
5155  } // Check changes in number of threads
5156 
5157 #if OMP_40_ENABLED
5158  kmp_info_t *master = team->t.t_threads[0];
5159  if (master->th.th_teams_microtask) {
5160  for (f = 1; f < new_nproc; ++f) {
5161  // propagate teams construct specific info to workers
5162  kmp_info_t *thr = team->t.t_threads[f];
5163  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5164  thr->th.th_teams_level = master->th.th_teams_level;
5165  thr->th.th_teams_size = master->th.th_teams_size;
5166  }
5167  }
5168 #endif /* OMP_40_ENABLED */
5169 #if KMP_NESTED_HOT_TEAMS
5170  if (level) {
5171  // Sync barrier state for nested hot teams, not needed for outermost hot
5172  // team.
5173  for (f = 1; f < new_nproc; ++f) {
5174  kmp_info_t *thr = team->t.t_threads[f];
5175  int b;
5176  kmp_balign_t *balign = thr->th.th_bar;
5177  for (b = 0; b < bs_last_barrier; ++b) {
5178  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5179  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5180 #if USE_DEBUGGER
5181  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5182 #endif
5183  }
5184  }
5185  }
5186 #endif // KMP_NESTED_HOT_TEAMS
5187 
5188  /* reallocate space for arguments if necessary */
5189  __kmp_alloc_argv_entries(argc, team, TRUE);
5190  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5191  // The hot team re-uses the previous task team,
5192  // if untouched during the previous release->gather phase.
5193 
5194  KF_TRACE(10, (" hot_team = %p\n", team));
5195 
5196 #if KMP_DEBUG
5197  if (__kmp_tasking_mode != tskm_immediate_exec) {
5198  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199  "task_team[1] = %p after reinit\n",
5200  team->t.t_task_team[0], team->t.t_task_team[1]));
5201  }
5202 #endif
5203 
5204 #if OMPT_SUPPORT
5205  __ompt_team_assign_id(team, ompt_parallel_data);
5206 #endif
5207 
5208  KMP_MB();
5209 
5210  return team;
5211  }
5212 
5213  /* next, let's try to take one from the team pool */
5214  KMP_MB();
5215  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5216  /* TODO: consider resizing undersized teams instead of reaping them, now
5217  that we have a resizing mechanism */
5218  if (team->t.t_max_nproc >= max_nproc) {
5219  /* take this team from the team pool */
5220  __kmp_team_pool = team->t.t_next_pool;
5221 
5222  /* setup the team for fresh use */
5223  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5224 
5225  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5226  "task_team[1] %p to NULL\n",
5227  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5228  team->t.t_task_team[0] = NULL;
5229  team->t.t_task_team[1] = NULL;
5230 
5231  /* reallocate space for arguments if necessary */
5232  __kmp_alloc_argv_entries(argc, team, TRUE);
5233  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5234 
5235  KA_TRACE(
5236  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5237  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5238  { // Initialize barrier data.
5239  int b;
5240  for (b = 0; b < bs_last_barrier; ++b) {
5241  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5242 #if USE_DEBUGGER
5243  team->t.t_bar[b].b_master_arrived = 0;
5244  team->t.t_bar[b].b_team_arrived = 0;
5245 #endif
5246  }
5247  }
5248 
5249 #if OMP_40_ENABLED
5250  team->t.t_proc_bind = new_proc_bind;
5251 #endif
5252 
5253  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5254  team->t.t_id));
5255 
5256 #if OMPT_SUPPORT
5257  __ompt_team_assign_id(team, ompt_parallel_data);
5258 #endif
5259 
5260  KMP_MB();
5261 
5262  return team;
5263  }
5264 
5265  /* reap team if it is too small, then loop back and check the next one */
5266  // not sure if this is wise, but it will be redone during the hot-teams
5267  // rewrite.
5268  /* TODO: Use technique to find the right size hot-team, don't reap them */
5269  team = __kmp_reap_team(team);
5270  __kmp_team_pool = team;
5271  }
5272 
5273  /* nothing available in the pool, no matter, make a new team! */
5274  KMP_MB();
5275  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5276 
5277  /* and set it up */
5278  team->t.t_max_nproc = max_nproc;
5279  /* NOTE well, for some reason allocating one big buffer and dividing it up
5280  seems to really hurt performance a lot on the P4, so let's not use this */
5281  __kmp_allocate_team_arrays(team, max_nproc);
5282 
5283  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5284  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5285 
5286  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5287  "%p to NULL\n",
5288  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5289  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5290  // memory, no need to duplicate
5291  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5292  // memory, no need to duplicate
5293 
5294  if (__kmp_storage_map) {
5295  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5296  }
5297 
5298  /* allocate space for arguments */
5299  __kmp_alloc_argv_entries(argc, team, FALSE);
5300  team->t.t_argc = argc;
5301 
5302  KA_TRACE(20,
5303  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5304  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5305  { // Initialize barrier data.
5306  int b;
5307  for (b = 0; b < bs_last_barrier; ++b) {
5308  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5309 #if USE_DEBUGGER
5310  team->t.t_bar[b].b_master_arrived = 0;
5311  team->t.t_bar[b].b_team_arrived = 0;
5312 #endif
5313  }
5314  }
5315 
5316 #if OMP_40_ENABLED
5317  team->t.t_proc_bind = new_proc_bind;
5318 #endif
5319 
5320 #if OMPT_SUPPORT
5321  __ompt_team_assign_id(team, ompt_parallel_data);
5322  team->t.ompt_serialized_team_info = NULL;
5323 #endif
5324 
5325  KMP_MB();
5326 
5327  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5328  team->t.t_id));
5329 
5330  return team;
5331 }
5332 
5333 /* TODO implement hot-teams at all levels */
5334 /* TODO implement lazy thread release on demand (disband request) */
5335 
5336 /* free the team. return it to the team pool. release all the threads
5337  * associated with it */
5338 void __kmp_free_team(kmp_root_t *root,
5339  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5340  int f;
5341  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5342  team->t.t_id));
5343 
5344  /* verify state */
5345  KMP_DEBUG_ASSERT(root);
5346  KMP_DEBUG_ASSERT(team);
5347  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5348  KMP_DEBUG_ASSERT(team->t.t_threads);
5349 
5350  int use_hot_team = team == root->r.r_hot_team;
5351 #if KMP_NESTED_HOT_TEAMS
5352  int level;
5353  kmp_hot_team_ptr_t *hot_teams;
5354  if (master) {
5355  level = team->t.t_active_level - 1;
5356  if (master->th.th_teams_microtask) { // in teams construct?
5357  if (master->th.th_teams_size.nteams > 1) {
5358  ++level; // level was not increased in teams construct for
5359  // team_of_masters
5360  }
5361  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5362  master->th.th_teams_level == team->t.t_level) {
5363  ++level; // level was not increased in teams construct for
5364  // team_of_workers before the parallel
5365  } // team->t.t_level will be increased inside parallel
5366  }
5367  hot_teams = master->th.th_hot_teams;
5368  if (level < __kmp_hot_teams_max_level) {
5369  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5370  use_hot_team = 1;
5371  }
5372  }
5373 #endif // KMP_NESTED_HOT_TEAMS
5374 
5375  /* team is done working */
5376  TCW_SYNC_PTR(team->t.t_pkfn,
5377  NULL); // Important for Debugging Support Library.
5378 #if KMP_OS_WINDOWS
5379  team->t.t_copyin_counter = 0; // init counter for possible reuse
5380 #endif
5381  // Do not reset pointer to parent team to NULL for hot teams.
5382 
5383  /* if we are non-hot team, release our threads */
5384  if (!use_hot_team) {
5385  if (__kmp_tasking_mode != tskm_immediate_exec) {
5386  // Wait for threads to reach reapable state
5387  for (f = 1; f < team->t.t_nproc; ++f) {
5388  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5389  kmp_info_t *th = team->t.t_threads[f];
5390  volatile kmp_uint32 *state = &th->th.th_reap_state;
5391  while (*state != KMP_SAFE_TO_REAP) {
5392 #if KMP_OS_WINDOWS
5393  // On Windows a thread can be killed at any time, check this
5394  DWORD ecode;
5395  if (!__kmp_is_thread_alive(th, &ecode)) {
5396  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5397  break;
5398  }
5399 #endif
5400  // first check if thread is sleeping
5401  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5402  if (fl.is_sleeping())
5403  fl.resume(__kmp_gtid_from_thread(th));
5404  KMP_CPU_PAUSE();
5405  }
5406  }
5407 
5408  // Delete task teams
5409  int tt_idx;
5410  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5411  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5412  if (task_team != NULL) {
5413  for (f = 0; f < team->t.t_nproc;
5414  ++f) { // Have all threads unref task teams
5415  team->t.t_threads[f]->th.th_task_team = NULL;
5416  }
5417  KA_TRACE(
5418  20,
5419  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5420  __kmp_get_gtid(), task_team, team->t.t_id));
5421 #if KMP_NESTED_HOT_TEAMS
5422  __kmp_free_task_team(master, task_team);
5423 #endif
5424  team->t.t_task_team[tt_idx] = NULL;
5425  }
5426  }
5427  }
5428 
5429  // Reset pointer to parent team only for non-hot teams.
5430  team->t.t_parent = NULL;
5431  team->t.t_level = 0;
5432  team->t.t_active_level = 0;
5433 
5434  /* free the worker threads */
5435  for (f = 1; f < team->t.t_nproc; ++f) {
5436  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5437  __kmp_free_thread(team->t.t_threads[f]);
5438  team->t.t_threads[f] = NULL;
5439  }
5440 
5441  /* put the team back in the team pool */
5442  /* TODO limit size of team pool, call reap_team if pool too large */
5443  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5444  __kmp_team_pool = (volatile kmp_team_t *)team;
5445  }
5446 
5447  KMP_MB();
5448 }
5449 
5450 /* reap the team. destroy it, reclaim all its resources and free its memory */
5451 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5452  kmp_team_t *next_pool = team->t.t_next_pool;
5453 
5454  KMP_DEBUG_ASSERT(team);
5455  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5456  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5457  KMP_DEBUG_ASSERT(team->t.t_threads);
5458  KMP_DEBUG_ASSERT(team->t.t_argv);
5459 
5460  /* TODO clean the threads that are a part of this? */
5461 
5462  /* free stuff */
5463  __kmp_free_team_arrays(team);
5464  if (team->t.t_argv != &team->t.t_inline_argv[0])
5465  __kmp_free((void *)team->t.t_argv);
5466  __kmp_free(team);
5467 
5468  KMP_MB();
5469  return next_pool;
5470 }
5471 
5472 // Free the thread. Don't reap it, just place it on the pool of available
5473 // threads.
5474 //
5475 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5476 // binding for the affinity mechanism to be useful.
5477 //
5478 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5479 // However, we want to avoid a potential performance problem by always
5480 // scanning through the list to find the correct point at which to insert
5481 // the thread (potential N**2 behavior). To do this we keep track of the
5482 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5483 // With single-level parallelism, threads will always be added to the tail
5484 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5485 // parallelism, all bets are off and we may need to scan through the entire
5486 // free list.
5487 //
5488 // This change also has a potentially large performance benefit, for some
5489 // applications. Previously, as threads were freed from the hot team, they
5490 // would be placed back on the free list in inverse order. If the hot team
5491  // grew back to its original size, then the freed thread would be placed
5492 // back on the hot team in reverse order. This could cause bad cache
5493 // locality problems on programs where the size of the hot team regularly
5494 // grew and shrunk.
5495 //
5496  // Now, for single-level parallelism, the OMP tid is always == gtid.
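//
// For example, if the pool holds gtids {2, 3, 5, 9} and the insert point
// caches the node for gtid 5, then freeing gtid 7 scans only from 5 and links
// 7 in before 9, while freeing gtid 4 resets the insert point (its cached gtid
// is greater than 4) and rescans from the head of the list.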
5497 void __kmp_free_thread(kmp_info_t *this_th) {
5498  int gtid;
5499  kmp_info_t **scan;
5500  kmp_root_t *root = this_th->th.th_root;
5501 
5502  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5503  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5504 
5505  KMP_DEBUG_ASSERT(this_th);
5506 
5507  // When moving the thread to the pool, switch it to wait on its own b_go flag,
5508  // and leave its barrier team pointer uninitialized (NULL).
5509  int b;
5510  kmp_balign_t *balign = this_th->th.th_bar;
5511  for (b = 0; b < bs_last_barrier; ++b) {
5512  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5513  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5514  balign[b].bb.team = NULL;
5515  balign[b].bb.leaf_kids = 0;
5516  }
5517  this_th->th.th_task_state = 0;
5518  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5519 
5520  /* put thread back on the free pool */
5521  TCW_PTR(this_th->th.th_team, NULL);
5522  TCW_PTR(this_th->th.th_root, NULL);
5523  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5524 
5525  /* If the implicit task assigned to this thread can be used by other threads
5526  * -> multiple threads can share the data and try to free the task in
5527  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5528  * with higher probability when the hot team is disabled, but can occur even when
5529  * the hot team is enabled */
5530  __kmp_free_implicit_task(this_th);
5531  this_th->th.th_current_task = NULL;
5532 
5533  // If the __kmp_thread_pool_insert_pt is already past the new insert
5534  // point, then we need to re-scan the entire list.
5535  gtid = this_th->th.th_info.ds.ds_gtid;
5536  if (__kmp_thread_pool_insert_pt != NULL) {
5537  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5538  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5539  __kmp_thread_pool_insert_pt = NULL;
5540  }
5541  }
5542 
5543  // Scan down the list to find the place to insert the thread.
5544  // scan is the address of a link in the list, possibly the address of
5545  // __kmp_thread_pool itself.
5546  //
5547  // In the absence of nested parallelism, the for loop will have 0 iterations.
5548  if (__kmp_thread_pool_insert_pt != NULL) {
5549  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5550  } else {
5551  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5552  }
5553  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5554  scan = &((*scan)->th.th_next_pool))
5555  ;
5556 
5557  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5558  // to its address.
5559  TCW_PTR(this_th->th.th_next_pool, *scan);
5560  __kmp_thread_pool_insert_pt = *scan = this_th;
5561  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5562  (this_th->th.th_info.ds.ds_gtid <
5563  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5564  TCW_4(this_th->th.th_in_pool, TRUE);
5565  __kmp_thread_pool_nth++;
5566 
5567  TCW_4(__kmp_nth, __kmp_nth - 1);
5568  root->r.r_cg_nthreads--;
5569 
5570 #ifdef KMP_ADJUST_BLOCKTIME
5571  /* Adjust blocktime back to user setting or default if necessary */
5572  /* Middle initialization might never have occurred */
5573  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5574  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5575  if (__kmp_nth <= __kmp_avail_proc) {
5576  __kmp_zero_bt = FALSE;
5577  }
5578  }
5579 #endif /* KMP_ADJUST_BLOCKTIME */
5580 
5581  KMP_MB();
5582 }
5583 
5584 /* ------------------------------------------------------------------------ */
5585 
5586 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5587  int gtid = this_thr->th.th_info.ds.ds_gtid;
5588  /* void *stack_data;*/
5589  kmp_team_t *(*volatile pteam);
5590 
5591  KMP_MB();
5592  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5593 
5594  if (__kmp_env_consistency_check) {
5595  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5596  }
5597 
5598 #if OMPT_SUPPORT
5599  ompt_data_t *thread_data;
5600  if (ompt_enabled.enabled) {
5601  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5602  *thread_data = ompt_data_none;
5603 
5604  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5605  this_thr->th.ompt_thread_info.wait_id = 0;
5606  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5607  if (ompt_enabled.ompt_callback_thread_begin) {
5608  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5609  ompt_thread_worker, thread_data);
5610  }
5611  }
5612 #endif
5613 
5614 #if OMPT_SUPPORT
5615  if (ompt_enabled.enabled) {
5616  this_thr->th.ompt_thread_info.state = omp_state_idle;
5617  }
5618 #endif
5619  /* This is the place where threads wait for work */
5620  while (!TCR_4(__kmp_global.g.g_done)) {
5621  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5622  KMP_MB();
5623 
5624  /* wait for work to do */
5625  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5626 
5627  /* No tid yet since not part of a team */
5628  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5629 
5630 #if OMPT_SUPPORT
5631  if (ompt_enabled.enabled) {
5632  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5633  }
5634 #endif
5635 
5636  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
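// pteam is the address of this thread's team pointer; the team pointer is
// filled in before workers are released from the fork barrier, so re-reading
// *pteam below observes the team this worker has just been assigned to.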
5637 
5638  /* have we been allocated? */
5639  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5640  /* we were just woken up, so run our new task */
5641  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5642  int rc;
5643  KA_TRACE(20,
5644  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5645  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5646  (*pteam)->t.t_pkfn));
5647 
5648  updateHWFPControl(*pteam);
5649 
5650 #if OMPT_SUPPORT
5651  if (ompt_enabled.enabled) {
5652  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5653  }
5654 #endif
5655 
5656  rc = (*pteam)->t.t_invoke(gtid);
5657  KMP_ASSERT(rc);
5658 
5659  KMP_MB();
5660  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5661  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5662  (*pteam)->t.t_pkfn));
5663  }
5664 #if OMPT_SUPPORT
5665  if (ompt_enabled.enabled) {
5666  /* no frame set while outside task */
5667  __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5668 
5669  this_thr->th.ompt_thread_info.state = omp_state_overhead;
5670  }
5671 #endif
5672  /* join barrier after parallel region */
5673  __kmp_join_barrier(gtid);
5674  }
5675  }
5676  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5677 
5678 #if OMPT_SUPPORT
5679  if (ompt_enabled.ompt_callback_thread_end) {
5680  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5681  }
5682 #endif
5683 
5684  this_thr->th.th_task_team = NULL;
5685  /* run the destructors for the threadprivate data for this thread */
5686  __kmp_common_destroy_gtid(gtid);
5687 
5688  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5689  KMP_MB();
5690  return this_thr;
5691 }
5692 
5693 /* ------------------------------------------------------------------------ */
5694 
5695 void __kmp_internal_end_dest(void *specific_gtid) {
5696 #if KMP_COMPILER_ICC
5697 #pragma warning(push)
5698 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5699 // significant bits
5700 #endif
5701  // Make sure no significant bits are lost
5702  int gtid = (kmp_intptr_t)specific_gtid - 1;
5703 #if KMP_COMPILER_ICC
5704 #pragma warning(pop)
5705 #endif
5706 
5707  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5708  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5709  * this is because 0 is reserved for the nothing-stored case */
5710 
5711  /* josh: One reason for setting the gtid specific data even when it is being
5712  destroyed by pthread is to allow gtid lookup through thread specific data
5713  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5714  that gets executed in the call to __kmp_internal_end_thread, actually
5715  gets the gtid through the thread specific data. Setting it here seems
5716  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5717  to run smoothly.
5718  todo: get rid of this after we remove the dependence on
5719  __kmp_gtid_get_specific */
5720  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5721  __kmp_gtid_set_specific(gtid);
5722 #ifdef KMP_TDATA_GTID
5723  __kmp_gtid = gtid;
5724 #endif
5725  __kmp_internal_end_thread(gtid);
5726 }
5727 
5728 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5729 
5730  // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5731  // cases the destructors work perfectly, but in real libomp.so I have no evidence
5732  // it is ever called. However, the -fini linker option in makefile.mk works fine.
5733 
5734 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5735  __kmp_internal_end_atexit();
5736 }
5737 
5738 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5739 
5740 #endif
5741 
5742 /* [Windows] josh: when the atexit handler is called, there may still be more
5743  than one thread alive */
5744 void __kmp_internal_end_atexit(void) {
5745  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5746  /* [Windows]
5747  josh: ideally, we want to completely shutdown the library in this atexit
5748  handler, but stat code that depends on thread specific data for gtid fails
5749  because that data becomes unavailable at some point during the shutdown, so
5750  we call __kmp_internal_end_thread instead. We should eventually remove the
5751  dependency on __kmp_get_specific_gtid in the stat code and use
5752  __kmp_internal_end_library to cleanly shutdown the library.
5753 
5754  // TODO: Can some of this comment about GVS be removed?
5755  I suspect that the offending stat code is executed when the calling thread
5756  tries to clean up a dead root thread's data structures, resulting in GVS
5757  code trying to close the GVS structures for that thread, but since the stat
5758  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5759  the calling thread is cleaning up itself instead of another thread, it gets
5760  confused. This happens because allowing a thread to unregister and cleanup
5761  another thread is a recent modification for addressing an issue.
5762  Based on the current design (20050722), a thread may end up
5763  trying to unregister another thread only if thread death does not trigger
5764  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5765  thread specific data destructor function to detect thread death. For
5766  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5767  is nothing. Thus, the workaround is applicable only for Windows static
5768  stat library. */
5769  __kmp_internal_end_library(-1);
5770 #if KMP_OS_WINDOWS
5771  __kmp_close_console();
5772 #endif
5773 }
5774 
5775 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5776  // It is assumed __kmp_forkjoin_lock is acquired.
5777 
5778  int gtid;
5779 
5780  KMP_DEBUG_ASSERT(thread != NULL);
5781 
5782  gtid = thread->th.th_info.ds.ds_gtid;
5783 
5784  if (!is_root) {
5785 
5786  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5787  /* Assume the threads are at the fork barrier here */
5788  KA_TRACE(
5789  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5790  gtid));
5791  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5792  * (GEH) */
5793  ANNOTATE_HAPPENS_BEFORE(thread);
5794  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5795  __kmp_release_64(&flag);
5796  }
5797 
5798  // Terminate OS thread.
5799  __kmp_reap_worker(thread);
5800 
5801  // The thread was killed asynchronously. If it was actively
5802  // spinning in the thread pool, decrement the global count.
5803  //
5804  // There is a small timing hole here - if the worker thread was just waking
5805  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5806  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5807  // the global counter might not get updated.
5808  //
5809  // Currently, this can only happen as the library is unloaded,
5810  // so there are no harmful side effects.
5811  if (thread->th.th_active_in_pool) {
5812  thread->th.th_active_in_pool = FALSE;
5813  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5814  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5815  }
5816 
5817  // Decrement # of [worker] threads in the pool.
5818  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5819  --__kmp_thread_pool_nth;
5820  }
5821 
5822  __kmp_free_implicit_task(thread);
5823 
5824 // Free the fast memory for tasking
5825 #if USE_FAST_MEMORY
5826  __kmp_free_fast_memory(thread);
5827 #endif /* USE_FAST_MEMORY */
5828 
5829  __kmp_suspend_uninitialize_thread(thread);
5830 
5831  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5832  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5833 
5834  --__kmp_all_nth;
5835 // __kmp_nth was decremented when thread is added to the pool.
5836 
5837 #ifdef KMP_ADJUST_BLOCKTIME
5838  /* Adjust blocktime back to user setting or default if necessary */
5839  /* Middle initialization might never have occurred */
5840  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5841  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5842  if (__kmp_nth <= __kmp_avail_proc) {
5843  __kmp_zero_bt = FALSE;
5844  }
5845  }
5846 #endif /* KMP_ADJUST_BLOCKTIME */
5847 
5848  /* free the memory being used */
5849  if (__kmp_env_consistency_check) {
5850  if (thread->th.th_cons) {
5851  __kmp_free_cons_stack(thread->th.th_cons);
5852  thread->th.th_cons = NULL;
5853  }
5854  }
5855 
5856  if (thread->th.th_pri_common != NULL) {
5857  __kmp_free(thread->th.th_pri_common);
5858  thread->th.th_pri_common = NULL;
5859  }
5860 
5861  if (thread->th.th_task_state_memo_stack != NULL) {
5862  __kmp_free(thread->th.th_task_state_memo_stack);
5863  thread->th.th_task_state_memo_stack = NULL;
5864  }
5865 
5866 #if KMP_USE_BGET
5867  if (thread->th.th_local.bget_data != NULL) {
5868  __kmp_finalize_bget(thread);
5869  }
5870 #endif
5871 
5872 #if KMP_AFFINITY_SUPPORTED
5873  if (thread->th.th_affin_mask != NULL) {
5874  KMP_CPU_FREE(thread->th.th_affin_mask);
5875  thread->th.th_affin_mask = NULL;
5876  }
5877 #endif /* KMP_AFFINITY_SUPPORTED */
5878 
5879 #if KMP_USE_HIER_SCHED
5880  if (thread->th.th_hier_bar_data != NULL) {
5881  __kmp_free(thread->th.th_hier_bar_data);
5882  thread->th.th_hier_bar_data = NULL;
5883  }
5884 #endif
5885 
5886  __kmp_reap_team(thread->th.th_serial_team);
5887  thread->th.th_serial_team = NULL;
5888  __kmp_free(thread);
5889 
5890  KMP_MB();
5891 
5892 } // __kmp_reap_thread
5893 
5894 static void __kmp_internal_end(void) {
5895  int i;
5896 
5897  /* First, unregister the library */
5898  __kmp_unregister_library();
5899 
5900 #if KMP_OS_WINDOWS
5901  /* In Win static library, we can't tell when a root actually dies, so we
5902  reclaim the data structures for any root threads that have died but not
5903  unregistered themselves, in order to shut down cleanly.
5904  In Win dynamic library we also can't tell when a thread dies. */
5905  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5906 // dead roots
5907 #endif
5908 
5909  for (i = 0; i < __kmp_threads_capacity; i++)
5910  if (__kmp_root[i])
5911  if (__kmp_root[i]->r.r_active)
5912  break;
5913  KMP_MB(); /* Flush all pending memory write invalidates. */
5914  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5915 
5916  if (i < __kmp_threads_capacity) {
5917 #if KMP_USE_MONITOR
5918  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5919  KMP_MB(); /* Flush all pending memory write invalidates. */
5920 
5921  // Need to check that monitor was initialized before reaping it. If we are
5922  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5923  // __kmp_monitor will appear to contain valid data, but it is only valid in
5924  // the parent process, not the child.
5925  // New behavior (201008): instead of keying off of the flag
5926  // __kmp_init_parallel, the monitor thread creation is keyed off
5927  // of the new flag __kmp_init_monitor.
5928  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5929  if (TCR_4(__kmp_init_monitor)) {
5930  __kmp_reap_monitor(&__kmp_monitor);
5931  TCW_4(__kmp_init_monitor, 0);
5932  }
5933  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5934  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5935 #endif // KMP_USE_MONITOR
5936  } else {
5937 /* TODO move this to cleanup code */
5938 #ifdef KMP_DEBUG
5939  /* make sure that everything has properly ended */
5940  for (i = 0; i < __kmp_threads_capacity; i++) {
5941  if (__kmp_root[i]) {
5942  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5943  // there can be uber threads alive here
5944  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5945  }
5946  }
5947 #endif
5948 
5949  KMP_MB();
5950 
5951  // Reap the worker threads.
5952  // This is valid for now, but be careful if threads are reaped sooner.
5953  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
5954  // Get the next thread from the pool.
5955  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5956  __kmp_thread_pool = thread->th.th_next_pool;
5957  // Reap it.
5958  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5959  thread->th.th_next_pool = NULL;
5960  thread->th.th_in_pool = FALSE;
5961  __kmp_reap_thread(thread, 0);
5962  }
5963  __kmp_thread_pool_insert_pt = NULL;
5964 
5965  // Reap teams.
5966  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
5967  // Get the next team from the pool.
5968  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5969  __kmp_team_pool = team->t.t_next_pool;
5970  // Reap it.
5971  team->t.t_next_pool = NULL;
5972  __kmp_reap_team(team);
5973  }
5974 
5975  __kmp_reap_task_teams();
5976 
5977 #if KMP_OS_UNIX
5978  // Threads that are not reaped should not access any resources since they
5979  // are going to be deallocated soon, so the shutdown sequence should wait
5980  // until all threads either exit the final spin-waiting loop or begin
5981  // sleeping after the given blocktime.
5982  for (i = 0; i < __kmp_threads_capacity; i++) {
5983  kmp_info_t *thr = __kmp_threads[i];
5984  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5985  KMP_CPU_PAUSE();
5986  }
5987 #endif
5988 
5989  for (i = 0; i < __kmp_threads_capacity; ++i) {
5990  // TBD: Add some checking...
5991  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5992  }
5993 
5994  /* Make sure all threadprivate destructors get run by joining with all
5995  worker threads before resetting this flag */
5996  TCW_SYNC_4(__kmp_init_common, FALSE);
5997 
5998  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5999  KMP_MB();
6000 
6001 #if KMP_USE_MONITOR
6002  // See note above: One of the possible fixes for CQ138434 / CQ140126
6003  //
6004  // FIXME: push both code fragments down and CSE them?
6005  // push them into __kmp_cleanup() ?
6006  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6007  if (TCR_4(__kmp_init_monitor)) {
6008  __kmp_reap_monitor(&__kmp_monitor);
6009  TCW_4(__kmp_init_monitor, 0);
6010  }
6011  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6012  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6013 #endif
6014  } /* else !__kmp_global.t_active */
6015  TCW_4(__kmp_init_gtid, FALSE);
6016  KMP_MB(); /* Flush all pending memory write invalidates. */
6017 
6018  __kmp_cleanup();
6019 #if OMPT_SUPPORT
6020  ompt_fini();
6021 #endif
6022 }
6023 
6024 void __kmp_internal_end_library(int gtid_req) {
6025  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6026  /* this shouldn't be a race condition because __kmp_internal_end() is the
6027  only place to clear __kmp_serial_init */
6028  /* we'll check this later too, after we get the lock */
6029  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6030  // redundant, because the next check will work in any case.
6031  if (__kmp_global.g.g_abort) {
6032  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6033  /* TODO abort? */
6034  return;
6035  }
6036  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6037  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6038  return;
6039  }
6040 
6041  KMP_MB(); /* Flush all pending memory write invalidates. */
6042 
6043  /* find out who we are and what we should do */
6044  {
6045  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6046  KA_TRACE(
6047  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6048  if (gtid == KMP_GTID_SHUTDOWN) {
6049  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6050  "already shutdown\n"));
6051  return;
6052  } else if (gtid == KMP_GTID_MONITOR) {
6053  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6054  "registered, or system shutdown\n"));
6055  return;
6056  } else if (gtid == KMP_GTID_DNE) {
6057  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6058  "shutdown\n"));
6059  /* we don't know who we are, but we may still shutdown the library */
6060  } else if (KMP_UBER_GTID(gtid)) {
6061  /* unregister ourselves as an uber thread. gtid is no longer valid */
6062  if (__kmp_root[gtid]->r.r_active) {
6063  __kmp_global.g.g_abort = -1;
6064  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6065  KA_TRACE(10,
6066  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6067  gtid));
6068  return;
6069  } else {
6070  KA_TRACE(
6071  10,
6072  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6073  __kmp_unregister_root_current_thread(gtid);
6074  }
6075  } else {
6076 /* worker threads may call this function through the atexit handler, if they
6077  * call exit() */
6078 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6079  TODO: do a thorough shutdown instead */
6080 #ifdef DUMP_DEBUG_ON_EXIT
6081  if (__kmp_debug_buf)
6082  __kmp_dump_debug_buffer();
6083 #endif
6084  return;
6085  }
6086  }
6087  /* synchronize the termination process */
6088  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6089 
6090  /* have we already finished */
6091  if (__kmp_global.g.g_abort) {
6092  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6093  /* TODO abort? */
6094  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6095  return;
6096  }
6097  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6098  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6099  return;
6100  }
6101 
6102  /* We need this lock to enforce mutual exclusion between this reading of
6103  __kmp_threads_capacity and the writing by __kmp_register_root.
6104  Alternatively, we can use a counter of roots that is atomically updated by
6105  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6106  __kmp_internal_end_*. */
6107  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6108 
6109  /* now we can safely conduct the actual termination */
6110  __kmp_internal_end();
6111 
6112  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6113  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6114 
6115  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6116 
6117 #ifdef DUMP_DEBUG_ON_EXIT
6118  if (__kmp_debug_buf)
6119  __kmp_dump_debug_buffer();
6120 #endif
6121 
6122 #if KMP_OS_WINDOWS
6123  __kmp_close_console();
6124 #endif
6125 
6126  __kmp_fini_allocator();
6127 
6128 } // __kmp_internal_end_library
6129 
6130 void __kmp_internal_end_thread(int gtid_req) {
6131  int i;
6132 
6133  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6134  /* this shouldn't be a race condition because __kmp_internal_end() is the
6135  * only place to clear __kmp_serial_init */
6136  /* we'll check this later too, after we get the lock */
6137  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6138  // redundant, because the next check will work in any case.
6139  if (__kmp_global.g.g_abort) {
6140  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6141  /* TODO abort? */
6142  return;
6143  }
6144  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6145  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6146  return;
6147  }
6148 
6149  KMP_MB(); /* Flush all pending memory write invalidates. */
6150 
6151  /* find out who we are and what we should do */
6152  {
6153  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6154  KA_TRACE(10,
6155  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6156  if (gtid == KMP_GTID_SHUTDOWN) {
6157  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6158  "already shutdown\n"));
6159  return;
6160  } else if (gtid == KMP_GTID_MONITOR) {
6161  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6162  "registered, or system shutdown\n"));
6163  return;
6164  } else if (gtid == KMP_GTID_DNE) {
6165  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6166  "shutdown\n"));
6167  return;
6168  /* we don't know who we are */
6169  } else if (KMP_UBER_GTID(gtid)) {
6170  /* unregister ourselves as an uber thread. gtid is no longer valid */
6171  if (__kmp_root[gtid]->r.r_active) {
6172  __kmp_global.g.g_abort = -1;
6173  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6174  KA_TRACE(10,
6175  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6176  gtid));
6177  return;
6178  } else {
6179  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6180  gtid));
6181  __kmp_unregister_root_current_thread(gtid);
6182  }
6183  } else {
6184  /* just a worker thread, let's leave */
6185  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6186 
6187  if (gtid >= 0) {
6188  __kmp_threads[gtid]->th.th_task_team = NULL;
6189  }
6190 
6191  KA_TRACE(10,
6192  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6193  gtid));
6194  return;
6195  }
6196  }
6197 #if KMP_DYNAMIC_LIB
6198  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber
6199  // thread, because it is better to shut down later, in the library destructor.
6200  // The reason for this change is a performance problem when a non-OpenMP thread
6201  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
6202  // keeping worker threads alive until the program shuts down.
6203  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6204  // and Windows(DPD200287443) that occurs when using critical sections from
6205  // foreign threads.
6206  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6207  return;
6208 #endif
6209  /* synchronize the termination process */
6210  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6211 
6212  /* have we already finished */
6213  if (__kmp_global.g.g_abort) {
6214  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6215  /* TODO abort? */
6216  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6217  return;
6218  }
6219  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6220  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6221  return;
6222  }
6223 
6224  /* We need this lock to enforce mutex between this reading of
6225  __kmp_threads_capacity and the writing by __kmp_register_root.
6226  Alternatively, we can use a counter of roots that is atomically updated by
6227  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6228  __kmp_internal_end_*. */
6229 
6230  /* should we finish the run-time? are all siblings done? */
6231  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6232 
6233  for (i = 0; i < __kmp_threads_capacity; ++i) {
6234  if (KMP_UBER_GTID(i)) {
6235  KA_TRACE(
6236  10,
6237  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6238  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6239  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6240  return;
6241  }
6242  }
6243 
6244  /* now we can safely conduct the actual termination */
6245 
6246  __kmp_internal_end();
6247 
6248  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6249  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6250 
6251  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6252 
6253 #ifdef DUMP_DEBUG_ON_EXIT
6254  if (__kmp_debug_buf)
6255  __kmp_dump_debug_buffer();
6256 #endif
6257 } // __kmp_internal_end_thread
6258 
6259 // -----------------------------------------------------------------------------
6260 // Library registration stuff.
6261 
6262 static long __kmp_registration_flag = 0;
6263 // Random value used to indicate library initialization.
6264 static char *__kmp_registration_str = NULL;
6265 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6266 
6267 static inline char *__kmp_reg_status_name() {
6268  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6269  each thread. If registration and unregistration happen in different threads
6270  (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6271  env var cannot be found, because its name will contain a different pid. */
6272  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6273 } // __kmp_reg_status_name
6274 
6275 void __kmp_register_library_startup(void) {
6276 
6277  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6278  int done = 0;
6279  union {
6280  double dtime;
6281  long ltime;
6282  } time;
6283 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6284  __kmp_initialize_system_tick();
6285 #endif
6286  __kmp_read_system_time(&time.dtime);
6287  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6288  __kmp_registration_str =
6289  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6290  __kmp_registration_flag, KMP_LIBRARY_FILE);
6291 
6292  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6293  __kmp_registration_str));
6294 
6295  while (!done) {
6296 
6297  char *value = NULL; // Actual value of the environment variable.
6298 
6299  // Set the environment variable, but do not overwrite it if it already exists.
6300  __kmp_env_set(name, __kmp_registration_str, 0);
6301  // Check that the variable was actually written.
6302  value = __kmp_env_get(name);
6303  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6304 
6305  done = 1; // Ok, environment variable set successfully, exit the loop.
6306 
6307  } else {
6308 
6309  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6310  // Check whether it is alive or dead.
6311  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6312  char *tail = value;
6313  char *flag_addr_str = NULL;
6314  char *flag_val_str = NULL;
6315  char const *file_name = NULL;
6316  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6317  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6318  file_name = tail;
6319  if (tail != NULL) {
6320  long *flag_addr = 0;
6321  long flag_val = 0;
6322  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6323  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6324  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6325  // First, check whether environment-encoded address is mapped into
6326  // addr space.
6327  // If so, dereference it to see if it still has the right value.
6328  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6329  neighbor = 1;
6330  } else {
6331  // If not, then we know the other copy of the library is no longer
6332  // running.
6333  neighbor = 2;
6334  }
6335  }
6336  }
6337  switch (neighbor) {
6338  case 0: // Cannot parse environment variable -- neighbor status unknown.
6339  // Assume it is the incompatible format of a future version of the
6340  // library, and assume the other library is alive.
6341  // WARN( ... ); // TODO: Issue a warning.
6342  file_name = "unknown library";
6343  // Attention! Falling through to the next case. That's intentional.
6344  case 1: { // Neighbor is alive.
6345  // Check it is allowed.
6346  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6347  if (!__kmp_str_match_true(duplicate_ok)) {
6348  // That's not allowed. Issue fatal error.
6349  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6350  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6351  }
6352  KMP_INTERNAL_FREE(duplicate_ok);
6353  __kmp_duplicate_library_ok = 1;
6354  done = 1; // Exit the loop.
6355  } break;
6356  case 2: { // Neighbor is dead.
6357  // Clear the variable and try to register library again.
6358  __kmp_env_unset(name);
6359  } break;
6360  default: { KMP_DEBUG_ASSERT(0); } break;
6361  }
6362  }
6363  KMP_INTERNAL_FREE((void *)value);
6364  }
6365  KMP_INTERNAL_FREE((void *)name);
6366 
6367 } // func __kmp_register_library_startup
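
/* Illustrative sketch only, not part of the runtime: the registration value
   built above has the form "<flag address>-<flag value>-<library file>",
   e.g. "0x7f0000001234-cafe5678-libomp.so". The sample string, the function
   name and the one-shot sscanf below are assumptions for demonstration; the
   loop above does the equivalent splitting with __kmp_str_split. */
#include <stdio.h>
static void example_parse_registration_value(void) {
  const char sample[] = "0x7f0000001234-cafe5678-libomp.so";
  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  char file_name[64] = "";
  // "%p-%lx-%63s" mirrors the "%p-%lx-%s" format used when the value is set.
  if (sscanf(sample, "%p-%lx-%63s", &flag_addr, &flag_val, file_name) == 3) {
    printf("addr=%p val=%lx file=%s\n", flag_addr, flag_val, file_name);
  }
}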
6368 
6369 void __kmp_unregister_library(void) {
6370 
6371  char *name = __kmp_reg_status_name();
6372  char *value = __kmp_env_get(name);
6373 
6374  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6375  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6376  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6377  // Ok, this is our variable. Delete it.
6378  __kmp_env_unset(name);
6379  }
6380 
6381  KMP_INTERNAL_FREE(__kmp_registration_str);
6382  KMP_INTERNAL_FREE(value);
6383  KMP_INTERNAL_FREE(name);
6384 
6385  __kmp_registration_flag = 0;
6386  __kmp_registration_str = NULL;
6387 
6388 } // __kmp_unregister_library
6389 
6390 // End of Library registration stuff.
6391 // -----------------------------------------------------------------------------
6392 
6393 #if KMP_MIC_SUPPORTED
6394 
6395 static void __kmp_check_mic_type() {
6396  kmp_cpuid_t cpuid_state = {0};
6397  kmp_cpuid_t *cs_p = &cpuid_state;
6398  __kmp_x86_cpuid(1, 0, cs_p);
6399  // We don't support mic1 at the moment
6400  if ((cs_p->eax & 0xff0) == 0xB10) {
6401  __kmp_mic_type = mic2;
6402  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6403  __kmp_mic_type = mic3;
6404  } else {
6405  __kmp_mic_type = non_mic;
6406  }
6407 }
6408 
6409 #endif /* KMP_MIC_SUPPORTED */
6410 
6411 static void __kmp_do_serial_initialize(void) {
6412  int i, gtid;
6413  int size;
6414 
6415  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6416 
6417  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6418  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6419  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6420  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6421  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6422 
6423 #if OMPT_SUPPORT
6424  ompt_pre_init();
6425 #endif
6426 
6427  __kmp_validate_locks();
6428 
6429  /* Initialize internal memory allocator */
6430  __kmp_init_allocator();
6431 
6432  /* Register the library startup via an environment variable and check to see
6433  whether another copy of the library is already registered. */
6434 
6435  __kmp_register_library_startup();
6436 
6437  /* TODO reinitialization of library */
6438  if (TCR_4(__kmp_global.g.g_done)) {
6439  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6440  }
6441 
6442  __kmp_global.g.g_abort = 0;
6443  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6444 
6445 /* initialize the locks */
6446 #if KMP_USE_ADAPTIVE_LOCKS
6447 #if KMP_DEBUG_ADAPTIVE_LOCKS
6448  __kmp_init_speculative_stats();
6449 #endif
6450 #endif
6451 #if KMP_STATS_ENABLED
6452  __kmp_stats_init();
6453 #endif
6454  __kmp_init_lock(&__kmp_global_lock);
6455  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6456  __kmp_init_lock(&__kmp_debug_lock);
6457  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6458  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6459  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6460  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6461  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6462  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6463  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6464  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6465  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6466  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6467  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6468  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6469  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6470  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6471  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6472 #if KMP_USE_MONITOR
6473  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6474 #endif
6475  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6476 
6477  /* conduct initialization and initial setup of configuration */
6478 
6479  __kmp_runtime_initialize();
6480 
6481 #if KMP_MIC_SUPPORTED
6482  __kmp_check_mic_type();
6483 #endif
6484 
6485 // Some global variable initialization moved here from kmp_env_initialize()
6486 #ifdef KMP_DEBUG
6487  kmp_diag = 0;
6488 #endif
6489  __kmp_abort_delay = 0;
6490 
6491  // From __kmp_init_dflt_team_nth()
6492  /* assume the entire machine will be used */
6493  __kmp_dflt_team_nth_ub = __kmp_xproc;
6494  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6495  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6496  }
6497  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6498  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6499  }
6500  __kmp_max_nth = __kmp_sys_max_nth;
6501  __kmp_cg_max_nth = __kmp_sys_max_nth;
6502  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6503  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6504  __kmp_teams_max_nth = __kmp_sys_max_nth;
6505  }
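
  /* Worked example with illustrative numbers: on a 64-proc machine
     (__kmp_xproc == 64), with KMP_MIN_NTH == 1 and __kmp_sys_max_nth == 32768,
     the clamping above leaves __kmp_dflt_team_nth_ub == 64,
     __kmp_max_nth == __kmp_cg_max_nth == 32768, and __kmp_teams_max_nth == 64. */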
6506 
6507  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6508  // part
6509  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6510 #if KMP_USE_MONITOR
6511  __kmp_monitor_wakeups =
6512  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6513  __kmp_bt_intervals =
6514  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6515 #endif
6516  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6517  __kmp_library = library_throughput;
6518  // From KMP_SCHEDULE initialization
6519  __kmp_static = kmp_sch_static_balanced;
6520 // AC: do not use analytical here, because it is non-monotonous
6521 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6522 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6523 // need to repeat assignment
6524 // Barrier initialization. Moved here from the barrier branch-bit control and
6525 // barrier method control parts of __kmp_env_initialize()
6526 #if KMP_FAST_REDUCTION_BARRIER
6527 #define kmp_reduction_barrier_gather_bb ((int)1)
6528 #define kmp_reduction_barrier_release_bb ((int)1)
6529 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6530 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6531 #endif // KMP_FAST_REDUCTION_BARRIER
6532  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6533  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6534  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6535  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6536  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6537 #if KMP_FAST_REDUCTION_BARRIER
6538  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6539  // lin_64 ): hyper,1
6540  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6541  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6542  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6543  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6544  }
6545 #endif // KMP_FAST_REDUCTION_BARRIER
6546  }
6547 #if KMP_FAST_REDUCTION_BARRIER
6548 #undef kmp_reduction_barrier_release_pat
6549 #undef kmp_reduction_barrier_gather_pat
6550 #undef kmp_reduction_barrier_release_bb
6551 #undef kmp_reduction_barrier_gather_bb
6552 #endif // KMP_FAST_REDUCTION_BARRIER
6553 #if KMP_MIC_SUPPORTED
6554  if (__kmp_mic_type == mic2) { // KNC
6555  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6556  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6557  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6558  1; // forkjoin release
6559  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6560  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6561  }
6562 #if KMP_FAST_REDUCTION_BARRIER
6563  if (__kmp_mic_type == mic2) { // KNC
6564  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6565  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6566  }
6567 #endif // KMP_FAST_REDUCTION_BARRIER
6568 #endif // KMP_MIC_SUPPORTED
6569 
6570 // From KMP_CHECKS initialization
6571 #ifdef KMP_DEBUG
6572  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6573 #else
6574  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6575 #endif
6576 
6577  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6578  __kmp_foreign_tp = TRUE;
6579 
6580  __kmp_global.g.g_dynamic = FALSE;
6581  __kmp_global.g.g_dynamic_mode = dynamic_default;
6582 
6583  __kmp_env_initialize(NULL);
6584 
6585 // Print all messages in message catalog for testing purposes.
6586 #ifdef KMP_DEBUG
6587  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6588  if (__kmp_str_match_true(val)) {
6589  kmp_str_buf_t buffer;
6590  __kmp_str_buf_init(&buffer);
6591  __kmp_i18n_dump_catalog(&buffer);
6592  __kmp_printf("%s", buffer.str);
6593  __kmp_str_buf_free(&buffer);
6594  }
6595  __kmp_env_free(&val);
6596 #endif
6597 
6598  __kmp_threads_capacity =
6599  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6600  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6601  __kmp_tp_capacity = __kmp_default_tp_capacity(
6602  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6603 
6604  // If the library is shut down properly, both pools must be NULL. Just in
6605  // case, set them to NULL -- some memory may leak, but subsequent code will
6606  // work even if pools are not freed.
6607  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6608  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6609  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6610  __kmp_thread_pool = NULL;
6611  __kmp_thread_pool_insert_pt = NULL;
6612  __kmp_team_pool = NULL;
6613 
6614  /* Allocate all of the variable sized records */
6615  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6616  * expandable */
6617  /* Since allocation is cache-aligned, just add extra padding at the end */
6618  size =
6619  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6620  CACHE_LINE;
6621  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6622  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6623  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6624 
6625  /* init thread counts */
6626  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6627  0); // Asserts fail if the library is reinitializing and
6628  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6629  __kmp_all_nth = 0;
6630  __kmp_nth = 0;
6631 
6632  /* setup the uber master thread and hierarchy */
6633  gtid = __kmp_register_root(TRUE);
6634  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6635  KMP_ASSERT(KMP_UBER_GTID(gtid));
6636  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6637 
6638  KMP_MB(); /* Flush all pending memory write invalidates. */
6639 
6640  __kmp_common_initialize();
6641 
6642 #if KMP_OS_UNIX
6643  /* invoke the child fork handler */
6644  __kmp_register_atfork();
6645 #endif
6646 
6647 #if !KMP_DYNAMIC_LIB
6648  {
6649  /* Invoke the exit handler when the program finishes, only for static
6650  /* Invoke the exit handler when the program finishes, only for the static
6651  library. For the dynamic library, we already have _fini and DllMain. */
6652  if (rc != 0) {
6653  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6654  __kmp_msg_null);
6655  }
6656  }
6657 #endif
6658 
6659 #if KMP_HANDLE_SIGNALS
6660 #if KMP_OS_UNIX
6661  /* NOTE: make sure that this is called before the user installs their own
6662  signal handlers so that the user handlers are called first. this way they
6663  can return false, not call our handler, avoid terminating the library, and
6664  continue execution where they left off. */
6665  __kmp_install_signals(FALSE);
6666 #endif /* KMP_OS_UNIX */
6667 #if KMP_OS_WINDOWS
6668  __kmp_install_signals(TRUE);
6669 #endif /* KMP_OS_WINDOWS */
6670 #endif
6671 
6672  /* we have finished the serial initialization */
6673  __kmp_init_counter++;
6674 
6675  __kmp_init_serial = TRUE;
6676 
6677  if (__kmp_settings) {
6678  __kmp_env_print();
6679  }
6680 
6681 #if OMP_40_ENABLED
6682  if (__kmp_display_env || __kmp_display_env_verbose) {
6683  __kmp_env_print_2();
6684  }
6685 #endif // OMP_40_ENABLED
6686 
6687 #if OMPT_SUPPORT
6688  ompt_post_init();
6689 #endif
6690 
6691  KMP_MB();
6692 
6693  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6694 }
6695 
6696 void __kmp_serial_initialize(void) {
6697  if (__kmp_init_serial) {
6698  return;
6699  }
6700  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6701  if (__kmp_init_serial) {
6702  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6703  return;
6704  }
6705  __kmp_do_serial_initialize();
6706  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6707 }
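
// Illustrative only: a generic rendering of the check/lock/re-check pattern
// used by __kmp_serial_initialize, __kmp_middle_initialize and
// __kmp_parallel_initialize. The names below are made up for the sketch and
// are not runtime APIs; the real code also uses the TCR_4/TCW_SYNC_4
// accessors and bootstrap locks rather than a plain bool and std::mutex.
#include <mutex>
namespace kmp_example {
static bool initialized = false;
static std::mutex init_lock;
static void do_expensive_initialization() { /* one-time setup would go here */ }
void ensure_initialized() {
  if (initialized) // cheap unlocked fast path
    return;
  std::lock_guard<std::mutex> guard(init_lock);
  if (initialized) // another thread may have finished while we waited
    return;
  do_expensive_initialization();
  initialized = true; // publish only after setup completes
}
} // namespace kmp_example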
6708 
6709 static void __kmp_do_middle_initialize(void) {
6710  int i, j;
6711  int prev_dflt_team_nth;
6712 
6713  if (!__kmp_init_serial) {
6714  __kmp_do_serial_initialize();
6715  }
6716 
6717  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6718 
6719  // Save the previous value for the __kmp_dflt_team_nth so that
6720  // we can avoid some reinitialization if it hasn't changed.
6721  prev_dflt_team_nth = __kmp_dflt_team_nth;
6722 
6723 #if KMP_AFFINITY_SUPPORTED
6724  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6725  // number of cores on the machine.
6726  __kmp_affinity_initialize();
6727 
6728  // Run through the __kmp_threads array and set the affinity mask
6729  // for each root thread that is currently registered with the RTL.
6730  for (i = 0; i < __kmp_threads_capacity; i++) {
6731  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6732  __kmp_affinity_set_init_mask(i, TRUE);
6733  }
6734  }
6735 #endif /* KMP_AFFINITY_SUPPORTED */
6736 
6737  KMP_ASSERT(__kmp_xproc > 0);
6738  if (__kmp_avail_proc == 0) {
6739  __kmp_avail_proc = __kmp_xproc;
6740  }
6741 
6742  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6743  // correct them now
6744  j = 0;
6745  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6746  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6747  __kmp_avail_proc;
6748  j++;
6749  }
6750 
6751  if (__kmp_dflt_team_nth == 0) {
6752 #ifdef KMP_DFLT_NTH_CORES
6753  // Default #threads = #cores
6754  __kmp_dflt_team_nth = __kmp_ncores;
6755  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6756  "__kmp_ncores (%d)\n",
6757  __kmp_dflt_team_nth));
6758 #else
6759  // Default #threads = #available OS procs
6760  __kmp_dflt_team_nth = __kmp_avail_proc;
6761  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6762  "__kmp_avail_proc(%d)\n",
6763  __kmp_dflt_team_nth));
6764 #endif /* KMP_DFLT_NTH_CORES */
6765  }
6766 
6767  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6768  __kmp_dflt_team_nth = KMP_MIN_NTH;
6769  }
6770  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6771  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6772  }
6773 
6774  // There's no harm in continuing if the following check fails,
6775  // but it indicates an error in the previous logic.
6776  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6777 
6778  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6779  // Run through the __kmp_threads array and set the num threads icv for each
6780  // root thread that is currently registered with the RTL (which has not
6781  // already explicitly set its nthreads-var with a call to
6782  // omp_set_num_threads()).
6783  for (i = 0; i < __kmp_threads_capacity; i++) {
6784  kmp_info_t *thread = __kmp_threads[i];
6785  if (thread == NULL)
6786  continue;
6787  if (thread->th.th_current_task->td_icvs.nproc != 0)
6788  continue;
6789 
6790  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6791  }
6792  }
6793  KA_TRACE(
6794  20,
6795  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6796  __kmp_dflt_team_nth));
6797 
6798 #ifdef KMP_ADJUST_BLOCKTIME
6799  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6800  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6801  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6802  if (__kmp_nth > __kmp_avail_proc) {
6803  __kmp_zero_bt = TRUE;
6804  }
6805  }
6806 #endif /* KMP_ADJUST_BLOCKTIME */
6807 
6808  /* we have finished middle initialization */
6809  TCW_SYNC_4(__kmp_init_middle, TRUE);
6810 
6811  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6812 }
6813 
6814 void __kmp_middle_initialize(void) {
6815  if (__kmp_init_middle) {
6816  return;
6817  }
6818  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6819  if (__kmp_init_middle) {
6820  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6821  return;
6822  }
6823  __kmp_do_middle_initialize();
6824  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6825 }
6826 
6827 void __kmp_parallel_initialize(void) {
6828  int gtid = __kmp_entry_gtid(); // this might be a new root
6829 
6830  /* synchronize parallel initialization (for sibling) */
6831  if (TCR_4(__kmp_init_parallel))
6832  return;
6833  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6834  if (TCR_4(__kmp_init_parallel)) {
6835  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6836  return;
6837  }
6838 
6839  /* TODO reinitialization after we have already shut down */
6840  if (TCR_4(__kmp_global.g.g_done)) {
6841  KA_TRACE(
6842  10,
6843  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6844  __kmp_infinite_loop();
6845  }
6846 
6847  /* jc: The lock __kmp_initz_lock is already held, so calling
6848  __kmp_serial_initialize would cause a deadlock. So we call
6849  __kmp_do_serial_initialize directly. */
6850  if (!__kmp_init_middle) {
6851  __kmp_do_middle_initialize();
6852  }
6853 
6854  /* begin initialization */
6855  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6856  KMP_ASSERT(KMP_UBER_GTID(gtid));
6857 
6858 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6859  // Save the FP control regs.
6860  // Worker threads will set theirs to these values at thread startup.
6861  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6862  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6863  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6864 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6865 
6866 #if KMP_OS_UNIX
6867 #if KMP_HANDLE_SIGNALS
6868  /* must be after __kmp_serial_initialize */
6869  __kmp_install_signals(TRUE);
6870 #endif
6871 #endif
6872 
6873  __kmp_suspend_initialize();
6874 
6875 #if defined(USE_LOAD_BALANCE)
6876  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6877  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6878  }
6879 #else
6880  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6881  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6882  }
6883 #endif
6884 
6885  if (__kmp_version) {
6886  __kmp_print_version_2();
6887  }
6888 
6889  /* we have finished parallel initialization */
6890  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6891 
6892  KMP_MB();
6893  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6894 
6895  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6896 }
6897 
6898 /* ------------------------------------------------------------------------ */
6899 
6900 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6901  kmp_team_t *team) {
6902  kmp_disp_t *dispatch;
6903 
6904  KMP_MB();
6905 
6906  /* none of the threads have encountered any constructs, yet. */
6907  this_thr->th.th_local.this_construct = 0;
6908 #if KMP_CACHE_MANAGE
6909  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6910 #endif /* KMP_CACHE_MANAGE */
6911  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6912  KMP_DEBUG_ASSERT(dispatch);
6913  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6914  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6915  // this_thr->th.th_info.ds.ds_tid ] );
6916 
6917  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6918 #if OMP_45_ENABLED
6919  dispatch->th_doacross_buf_idx =
6920  0; /* reset the doacross dispatch buffer counter */
6921 #endif
6922  if (__kmp_env_consistency_check)
6923  __kmp_push_parallel(gtid, team->t.t_ident);
6924 
6925  KMP_MB(); /* Flush all pending memory write invalidates. */
6926 }
6927 
6928 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6929  kmp_team_t *team) {
6930  if (__kmp_env_consistency_check)
6931  __kmp_pop_parallel(gtid, team->t.t_ident);
6932 
6933  __kmp_finish_implicit_task(this_thr);
6934 }
6935 
6936 int __kmp_invoke_task_func(int gtid) {
6937  int rc;
6938  int tid = __kmp_tid_from_gtid(gtid);
6939  kmp_info_t *this_thr = __kmp_threads[gtid];
6940  kmp_team_t *team = this_thr->th.th_team;
6941 
6942  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6943 #if USE_ITT_BUILD
6944  if (__itt_stack_caller_create_ptr) {
6945  __kmp_itt_stack_callee_enter(
6946  (__itt_caller)
6947  team->t.t_stack_id); // inform ittnotify about entering user's code
6948  }
6949 #endif /* USE_ITT_BUILD */
6950 #if INCLUDE_SSC_MARKS
6951  SSC_MARK_INVOKING();
6952 #endif
6953 
6954 #if OMPT_SUPPORT
6955  void *dummy;
6956  void **exit_runtime_p;
6957  ompt_data_t *my_task_data;
6958  ompt_data_t *my_parallel_data;
6959  int ompt_team_size;
6960 
6961  if (ompt_enabled.enabled) {
6962  exit_runtime_p = &(
6963  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6964  } else {
6965  exit_runtime_p = &dummy;
6966  }
6967 
6968  my_task_data =
6969  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6970  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6971  if (ompt_enabled.ompt_callback_implicit_task) {
6972  ompt_team_size = team->t.t_nproc;
6973  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6974  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6975  __kmp_tid_from_gtid(gtid));
6976  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6977  }
6978 #endif
6979 
6980  {
6981  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6982  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6983  rc =
6984  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6985  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6986 #if OMPT_SUPPORT
6987  ,
6988  exit_runtime_p
6989 #endif
6990  );
6991 #if OMPT_SUPPORT
6992  *exit_runtime_p = NULL;
6993 #endif
6994  }
6995 
6996 #if USE_ITT_BUILD
6997  if (__itt_stack_caller_create_ptr) {
6998  __kmp_itt_stack_callee_leave(
6999  (__itt_caller)
7000  team->t.t_stack_id); // inform ittnotify about leaving user's code
7001  }
7002 #endif /* USE_ITT_BUILD */
7003  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7004 
7005  return rc;
7006 }
7007 
7008 #if OMP_40_ENABLED
7009 void __kmp_teams_master(int gtid) {
7010  // This routine is called by all master threads in teams construct
7011  kmp_info_t *thr = __kmp_threads[gtid];
7012  kmp_team_t *team = thr->th.th_team;
7013  ident_t *loc = team->t.t_ident;
7014  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7015  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7016  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7017  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7018  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7019 // Launch the league of teams now, but do not let the workers execute
7020 // (they hang on the fork barrier until the next parallel region)
7021 #if INCLUDE_SSC_MARKS
7022  SSC_MARK_FORKING();
7023 #endif
7024  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7025  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7026  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7027 #if INCLUDE_SSC_MARKS
7028  SSC_MARK_JOINING();
7029 #endif
7030 
7031  // AC: the last parameter "1" eliminates the join barrier, which won't work
7032  // because the worker threads are in a fork barrier waiting for more parallel regions
7033  __kmp_join_call(loc, gtid
7034 #if OMPT_SUPPORT
7035  ,
7036  fork_context_intel
7037 #endif
7038  ,
7039  1);
7040 }
7041 
7042 int __kmp_invoke_teams_master(int gtid) {
7043  kmp_info_t *this_thr = __kmp_threads[gtid];
7044  kmp_team_t *team = this_thr->th.th_team;
7045 #if KMP_DEBUG
7046  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7047  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7048  (void *)__kmp_teams_master);
7049 #endif
7050  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7051  __kmp_teams_master(gtid);
7052  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7053  return 1;
7054 }
7055 #endif /* OMP_40_ENABLED */
7056 
7057 /* This sets the requested number of threads for the next parallel region
7058  encountered by this team. Since this should be enclosed in the forkjoin
7059  critical section, it should avoid race conditions with asymmetrical nested
7060  parallelism. */
7061 
7062 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7063  kmp_info_t *thr = __kmp_threads[gtid];
7064 
7065  if (num_threads > 0)
7066  thr->th.th_set_nproc = num_threads;
7067 }
7068 
7069 #if OMP_40_ENABLED
7070 
7071 /* this sets the requested number of teams for the teams region and/or
7072  the number of threads for the next parallel region encountered */
7073 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7074  int num_threads) {
7075  kmp_info_t *thr = __kmp_threads[gtid];
7076  KMP_DEBUG_ASSERT(num_teams >= 0);
7077  KMP_DEBUG_ASSERT(num_threads >= 0);
7078 
7079  if (num_teams == 0)
7080  num_teams = 1; // default number of teams is 1.
7081  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7082  if (!__kmp_reserve_warn) {
7083  __kmp_reserve_warn = 1;
7084  __kmp_msg(kmp_ms_warning,
7085  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7086  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7087  }
7088  num_teams = __kmp_teams_max_nth;
7089  }
7090  // Set number of teams (number of threads in the outer "parallel" of the
7091  // teams)
7092  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7093 
7094  // Remember the number of threads for inner parallel regions
7095  if (num_threads == 0) {
7096  if (!TCR_4(__kmp_init_middle))
7097  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7098  num_threads = __kmp_avail_proc / num_teams;
7099  if (num_teams * num_threads > __kmp_teams_max_nth) {
7100  // adjust num_threads w/o a warning as it is not a user setting
7101  num_threads = __kmp_teams_max_nth / num_teams;
7102  }
7103  } else {
7104  if (num_teams * num_threads > __kmp_teams_max_nth) {
7105  int new_threads = __kmp_teams_max_nth / num_teams;
7106  if (!__kmp_reserve_warn) { // user asked for too many threads
7107  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7108  __kmp_msg(kmp_ms_warning,
7109  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7110  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7111  }
7112  num_threads = new_threads;
7113  }
7114  }
7115  thr->th.th_teams_size.nth = num_threads;
7116 }
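
// Worked example with illustrative numbers: with __kmp_avail_proc == 16,
// __kmp_teams_max_nth == 16, num_teams == 4 and num_threads == 0, the code
// above defaults num_threads to 16 / 4 == 4; since 4 * 4 does not exceed the
// teams limit, each team keeps 4 threads (th_teams_size = {nteams=4, nth=4}).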
7117 
7118 // Set the proc_bind var to use in the following parallel region.
7119 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7120  kmp_info_t *thr = __kmp_threads[gtid];
7121  thr->th.th_set_proc_bind = proc_bind;
7122 }
7123 
7124 #endif /* OMP_40_ENABLED */
7125 
7126 /* Launch the worker threads into the microtask. */
7127 
7128 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7129  kmp_info_t *this_thr = __kmp_threads[gtid];
7130 
7131 #ifdef KMP_DEBUG
7132  int f;
7133 #endif /* KMP_DEBUG */
7134 
7135  KMP_DEBUG_ASSERT(team);
7136  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7137  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7138  KMP_MB(); /* Flush all pending memory write invalidates. */
7139 
7140  team->t.t_construct = 0; /* no single directives seen yet */
7141  team->t.t_ordered.dt.t_value =
7142  0; /* thread 0 enters the ordered section first */
7143 
7144  /* Reset the identifiers on the dispatch buffer */
7145  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7146  if (team->t.t_max_nproc > 1) {
7147  int i;
7148  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7149  team->t.t_disp_buffer[i].buffer_index = i;
7150 #if OMP_45_ENABLED
7151  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7152 #endif
7153  }
7154  } else {
7155  team->t.t_disp_buffer[0].buffer_index = 0;
7156 #if OMP_45_ENABLED
7157  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7158 #endif
7159  }
7160 
7161  KMP_MB(); /* Flush all pending memory write invalidates. */
7162  KMP_ASSERT(this_thr->th.th_team == team);
7163 
7164 #ifdef KMP_DEBUG
7165  for (f = 0; f < team->t.t_nproc; f++) {
7166  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7167  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7168  }
7169 #endif /* KMP_DEBUG */
7170 
7171  /* release the worker threads so they may begin working */
7172  __kmp_fork_barrier(gtid, 0);
7173 }
7174 
7175 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7176  kmp_info_t *this_thr = __kmp_threads[gtid];
7177 
7178  KMP_DEBUG_ASSERT(team);
7179  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7180  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7181  KMP_MB(); /* Flush all pending memory write invalidates. */
7182 
7183 /* Join barrier after fork */
7184 
7185 #ifdef KMP_DEBUG
7186  if (__kmp_threads[gtid] &&
7187  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7188  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7189  __kmp_threads[gtid]);
7190  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7191  "team->t.t_nproc=%d\n",
7192  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7193  team->t.t_nproc);
7194  __kmp_print_structure();
7195  }
7196  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7197  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7198 #endif /* KMP_DEBUG */
7199 
7200  __kmp_join_barrier(gtid); /* wait for everyone */
7201 #if OMPT_SUPPORT
7202  if (ompt_enabled.enabled &&
7203  this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7204  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7205  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7206  this_thr->th.ompt_thread_info.state = omp_state_overhead;
7207 #if OMPT_OPTIONAL
7208  void *codeptr = NULL;
7209  if (KMP_MASTER_TID(ds_tid) &&
7210  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7211  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7212  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7213 
7214  if (ompt_enabled.ompt_callback_sync_region_wait) {
7215  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7216  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7217  }
7218  if (ompt_enabled.ompt_callback_sync_region) {
7219  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7220  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7221  }
7222 #endif
7223  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7224  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7225  ompt_scope_end, NULL, task_data, 0, ds_tid);
7226  }
7227  }
7228 #endif
7229 
7230  KMP_MB(); /* Flush all pending memory write invalidates. */
7231  KMP_ASSERT(this_thr->th.th_team == team);
7232 }
7233 
7234 /* ------------------------------------------------------------------------ */
7235 
7236 #ifdef USE_LOAD_BALANCE
7237 
7238 // Return the worker threads actively spinning in the hot team, if we
7239 // Return the number of worker threads actively spinning in the hot team, if
7240 // we are at the outermost level of parallelism. Otherwise, return 0.
7241  int i;
7242  int retval;
7243  kmp_team_t *hot_team;
7244 
7245  if (root->r.r_active) {
7246  return 0;
7247  }
7248  hot_team = root->r.r_hot_team;
7249  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7250  return hot_team->t.t_nproc - 1; // Don't count master thread
7251  }
7252 
7253  // Skip the master thread - it is accounted for elsewhere.
7254  retval = 0;
7255  for (i = 1; i < hot_team->t.t_nproc; i++) {
7256  if (hot_team->t.t_threads[i]->th.th_active) {
7257  retval++;
7258  }
7259  }
7260  return retval;
7261 }
7262 
7263 // Perform an automatic adjustment to the number of
7264 // threads used by the next parallel region.
7265 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7266  int retval;
7267  int pool_active;
7268  int hot_team_active;
7269  int team_curr_active;
7270  int system_active;
7271 
7272  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7273  set_nproc));
7274  KMP_DEBUG_ASSERT(root);
7275  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7276  ->th.th_current_task->td_icvs.dynamic == TRUE);
7277  KMP_DEBUG_ASSERT(set_nproc > 1);
7278 
7279  if (set_nproc == 1) {
7280  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7281  return 1;
7282  }
7283 
7284  // Threads that are active in the thread pool, active in the hot team for this
7285  // particular root (if we are at the outer par level), and the currently
7286  // executing thread (to become the master) are available to add to the new
7287  // team, but are currently contributing to the system load, and must be
7288  // accounted for.
7289  pool_active = __kmp_thread_pool_active_nth;
7290  hot_team_active = __kmp_active_hot_team_nproc(root);
7291  team_curr_active = pool_active + hot_team_active + 1;
7292 
7293  // Check the system load.
7294  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7295  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7296  "hot team active = %d\n",
7297  system_active, pool_active, hot_team_active));
7298 
7299  if (system_active < 0) {
7300  // There was an error reading the necessary info from /proc, so use the
7301  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7302  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7303  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7304  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7305 
7306  // Make this call behave like the thread limit algorithm.
7307  retval = __kmp_avail_proc - __kmp_nth +
7308  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7309  if (retval > set_nproc) {
7310  retval = set_nproc;
7311  }
7312  if (retval < KMP_MIN_NTH) {
7313  retval = KMP_MIN_NTH;
7314  }
7315 
7316  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7317  retval));
7318  return retval;
7319  }
7320 
7321  // There is a slight delay in the load balance algorithm in detecting new
7322  // running procs. The real system load at this instant should be at least as
7323  // large as the number of active OMP threads that are available to add to the team.
7324  if (system_active < team_curr_active) {
7325  system_active = team_curr_active;
7326  }
7327  retval = __kmp_avail_proc - system_active + team_curr_active;
7328  if (retval > set_nproc) {
7329  retval = set_nproc;
7330  }
7331  if (retval < KMP_MIN_NTH) {
7332  retval = KMP_MIN_NTH;
7333  }
7334 
7335  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7336  return retval;
7337 } // __kmp_load_balance_nproc()
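
// Worked example with illustrative numbers: with __kmp_avail_proc == 16,
// 3 threads spinning in the pool, 4 active workers in the hot team and the
// current (master) thread, team_curr_active == 3 + 4 + 1 == 8. If /proc
// reports system_active == 10, then retval == 16 - 10 + 8 == 14, which is
// then clamped to set_nproc from above and to KMP_MIN_NTH from below.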
7338 
7339 #endif /* USE_LOAD_BALANCE */
7340 
7341 /* ------------------------------------------------------------------------ */
7342 
7343 /* NOTE: this is called with the __kmp_initz_lock held */
7344 void __kmp_cleanup(void) {
7345  int f;
7346 
7347  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7348 
7349  if (TCR_4(__kmp_init_parallel)) {
7350 #if KMP_HANDLE_SIGNALS
7351  __kmp_remove_signals();
7352 #endif
7353  TCW_4(__kmp_init_parallel, FALSE);
7354  }
7355 
7356  if (TCR_4(__kmp_init_middle)) {
7357 #if KMP_AFFINITY_SUPPORTED
7358  __kmp_affinity_uninitialize();
7359 #endif /* KMP_AFFINITY_SUPPORTED */
7360  __kmp_cleanup_hierarchy();
7361  TCW_4(__kmp_init_middle, FALSE);
7362  }
7363 
7364  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7365 
7366  if (__kmp_init_serial) {
7367  __kmp_runtime_destroy();
7368  __kmp_init_serial = FALSE;
7369  }
7370 
7371  __kmp_cleanup_threadprivate_caches();
7372 
7373  for (f = 0; f < __kmp_threads_capacity; f++) {
7374  if (__kmp_root[f] != NULL) {
7375  __kmp_free(__kmp_root[f]);
7376  __kmp_root[f] = NULL;
7377  }
7378  }
7379  __kmp_free(__kmp_threads);
7380  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7381  // there is no need to free __kmp_root separately.
7382  __kmp_threads = NULL;
7383  __kmp_root = NULL;
7384  __kmp_threads_capacity = 0;
7385 
7386 #if KMP_USE_DYNAMIC_LOCK
7387  __kmp_cleanup_indirect_user_locks();
7388 #else
7389  __kmp_cleanup_user_locks();
7390 #endif
7391 
7392 #if KMP_AFFINITY_SUPPORTED
7393  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7394  __kmp_cpuinfo_file = NULL;
7395 #endif /* KMP_AFFINITY_SUPPORTED */
7396 
7397 #if KMP_USE_ADAPTIVE_LOCKS
7398 #if KMP_DEBUG_ADAPTIVE_LOCKS
7399  __kmp_print_speculative_stats();
7400 #endif
7401 #endif
7402  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7403  __kmp_nested_nth.nth = NULL;
7404  __kmp_nested_nth.size = 0;
7405  __kmp_nested_nth.used = 0;
7406  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7407  __kmp_nested_proc_bind.bind_types = NULL;
7408  __kmp_nested_proc_bind.size = 0;
7409  __kmp_nested_proc_bind.used = 0;
7410 
7411  __kmp_i18n_catclose();
7412 
7413 #if KMP_USE_HIER_SCHED
7414  __kmp_hier_scheds.deallocate();
7415 #endif
7416 
7417 #if KMP_STATS_ENABLED
7418  __kmp_stats_fini();
7419 #endif
7420 
7421  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7422 }
7423 
7424 /* ------------------------------------------------------------------------ */
7425 
7426 int __kmp_ignore_mppbeg(void) {
7427  char *env;
7428 
7429  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7430  if (__kmp_str_match_false(env))
7431  return FALSE;
7432  }
7433  // By default __kmpc_begin() is no-op.
7434  return TRUE;
7435 }
7436 
7437 int __kmp_ignore_mppend(void) {
7438  char *env;
7439 
7440  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7441  if (__kmp_str_match_false(env))
7442  return FALSE;
7443  }
7444  // By default __kmpc_end() is no-op.
7445  return TRUE;
7446 }
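
// Usage note: setting KMP_IGNORE_MPPBEG or KMP_IGNORE_MPPEND to a value that
// reads as false (e.g. "false" or "0") makes the corresponding __kmpc_begin()
// or __kmpc_end() call take effect; when the variable is unset or holds any
// other value, the call remains a no-op.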
7447 
7448 void __kmp_internal_begin(void) {
7449  int gtid;
7450  kmp_root_t *root;
7451 
7452  /* this is a very important step as it will register new sibling threads
7453  and assign each of these new uber threads a new gtid */
7454  gtid = __kmp_entry_gtid();
7455  root = __kmp_threads[gtid]->th.th_root;
7456  KMP_ASSERT(KMP_UBER_GTID(gtid));
7457 
7458  if (root->r.r_begin)
7459  return;
7460  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7461  if (root->r.r_begin) {
7462  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7463  return;
7464  }
7465 
7466  root->r.r_begin = TRUE;
7467 
7468  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7469 }
7470 
7471 /* ------------------------------------------------------------------------ */
7472 
7473 void __kmp_user_set_library(enum library_type arg) {
7474  int gtid;
7475  kmp_root_t *root;
7476  kmp_info_t *thread;
7477 
7478  /* first, make sure we are initialized so we can get our gtid */
7479 
7480  gtid = __kmp_entry_gtid();
7481  thread = __kmp_threads[gtid];
7482 
7483  root = thread->th.th_root;
7484 
7485  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7486  library_serial));
7487  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7488  thread */
7489  KMP_WARNING(SetLibraryIncorrectCall);
7490  return;
7491  }
7492 
7493  switch (arg) {
7494  case library_serial:
7495  thread->th.th_set_nproc = 0;
7496  set__nproc(thread, 1);
7497  break;
7498  case library_turnaround:
7499  thread->th.th_set_nproc = 0;
7500  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7501  : __kmp_dflt_team_nth_ub);
7502  break;
7503  case library_throughput:
7504  thread->th.th_set_nproc = 0;
7505  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7506  : __kmp_dflt_team_nth_ub);
7507  break;
7508  default:
7509  KMP_FATAL(UnknownLibraryType, arg);
7510  }
7511 
7512  __kmp_aux_set_library(arg);
7513 }
7514 
7515 void __kmp_aux_set_stacksize(size_t arg) {
7516  if (!__kmp_init_serial)
7517  __kmp_serial_initialize();
7518 
7519 #if KMP_OS_DARWIN
7520  if (arg & (0x1000 - 1)) {
7521  arg &= ~(0x1000 - 1);
7522  if (arg + 0x1000) /* check for overflow if we round up */
7523  arg += 0x1000;
7524  }
7525 #endif
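  /* Worked example for the rounding above: a request of 0x1800 bytes has low
     bits set, is masked down to 0x1000 and then bumped to 0x2000, i.e. the
     requested stack size is rounded up to the next 4 KB boundary (unless the
     addition would wrap around to zero). */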
7526  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7527 
7528  /* only change the default stacksize before the first parallel region */
7529  if (!TCR_4(__kmp_init_parallel)) {
7530  size_t value = arg; /* argument is in bytes */
7531 
7532  if (value < __kmp_sys_min_stksize)
7533  value = __kmp_sys_min_stksize;
7534  else if (value > KMP_MAX_STKSIZE)
7535  value = KMP_MAX_STKSIZE;
7536 
7537  __kmp_stksize = value;
7538 
7539  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7540  }
7541 
7542  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7543 }
7544 
7545 /* set the behaviour of the runtime library */
7546 /* TODO this can cause some odd behaviour with sibling parallelism... */
7547 void __kmp_aux_set_library(enum library_type arg) {
7548  __kmp_library = arg;
7549 
7550  switch (__kmp_library) {
7551  case library_serial: {
7552  KMP_INFORM(LibraryIsSerial);
7553  (void)__kmp_change_library(TRUE);
7554  } break;
7555  case library_turnaround:
7556  (void)__kmp_change_library(TRUE);
7557  break;
7558  case library_throughput:
7559  (void)__kmp_change_library(FALSE);
7560  break;
7561  default:
7562  KMP_FATAL(UnknownLibraryType, arg);
7563  }
7564 }
7565 
7566 /* ------------------------------------------------------------------------ */
7567 
7568 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7569  int blocktime = arg; /* argument is in milliseconds */
7570 #if KMP_USE_MONITOR
7571  int bt_intervals;
7572 #endif
7573  int bt_set;
7574 
7575  __kmp_save_internal_controls(thread);
7576 
7577  /* Normalize and set blocktime for the teams */
7578  if (blocktime < KMP_MIN_BLOCKTIME)
7579  blocktime = KMP_MIN_BLOCKTIME;
7580  else if (blocktime > KMP_MAX_BLOCKTIME)
7581  blocktime = KMP_MAX_BLOCKTIME;
7582 
7583  set__blocktime_team(thread->th.th_team, tid, blocktime);
7584  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7585 
7586 #if KMP_USE_MONITOR
7587  /* Calculate and set blocktime intervals for the teams */
7588  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7589 
7590  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7591  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7592 #endif
7593 
7594  /* Record that blocktime has been explicitly set */
7595  bt_set = TRUE;
7596 
7597  set__bt_set_team(thread->th.th_team, tid, bt_set);
7598  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7599 #if KMP_USE_MONITOR
7600  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7601  "bt_intervals=%d, monitor_updates=%d\n",
7602  __kmp_gtid_from_tid(tid, thread->th.th_team),
7603  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7604  __kmp_monitor_wakeups));
7605 #else
7606  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7607  __kmp_gtid_from_tid(tid, thread->th.th_team),
7608  thread->th.th_team->t.t_id, tid, blocktime));
7609 #endif
7610 }
7611 
7612 void __kmp_aux_set_defaults(char const *str, int len) {
7613  if (!__kmp_init_serial) {
7614  __kmp_serial_initialize();
7615  }
7616  __kmp_env_initialize(str);
7617 
7618  if (__kmp_settings
7619 #if OMP_40_ENABLED
7620  || __kmp_display_env || __kmp_display_env_verbose
7621 #endif // OMP_40_ENABLED
7622  ) {
7623  __kmp_env_print();
7624  }
7625 } // __kmp_aux_set_defaults
7626 
7627 /* ------------------------------------------------------------------------ */
7628 /* internal fast reduction routines */
7629 
7630 PACKED_REDUCTION_METHOD_T
7631 __kmp_determine_reduction_method(
7632  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7633  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7634  kmp_critical_name *lck) {
7635 
7636  // Default reduction method: critical construct ( lck != NULL, like in current
7637  // PAROPT )
7638  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7639  // can be selected by RTL
7640  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7641  // can be selected by RTL
7642  // Finally, it's up to the OpenMP RTL to decide which method to select
7643  // among those generated by PAROPT.
7644 
7645  PACKED_REDUCTION_METHOD_T retval;
7646 
7647  int team_size;
7648 
7649  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7650  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7651 
7652 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
7653  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7654 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7655 
7656  retval = critical_reduce_block;
7657 
7658  // another way of getting the team size (with 1 dynamic dereference) is slower
7659  team_size = __kmp_get_team_num_threads(global_tid);
7660  if (team_size == 1) {
7661 
7662  retval = empty_reduce_block;
7663 
7664  } else {
7665 
7666  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7667 
7668 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7669 
7670 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \
7671  KMP_OS_DARWIN || KMP_OS_HURD
7672 
7673  int teamsize_cutoff = 4;
7674 
7675 #if KMP_MIC_SUPPORTED
7676  if (__kmp_mic_type != non_mic) {
7677  teamsize_cutoff = 8;
7678  }
7679 #endif
7680  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7681  if (tree_available) {
7682  if (team_size <= teamsize_cutoff) {
7683  if (atomic_available) {
7684  retval = atomic_reduce_block;
7685  }
7686  } else {
7687  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7688  }
7689  } else if (atomic_available) {
7690  retval = atomic_reduce_block;
7691  }
7692 #else
7693 #error "Unknown or unsupported OS"
7694 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7695 // KMP_OS_DARWIN
7696 
7697 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7698 
7699 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
7700 
7701  // basic tuning
7702 
7703  if (atomic_available) {
7704  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7705  retval = atomic_reduce_block;
7706  }
7707  } // otherwise: use critical section
7708 
7709 #elif KMP_OS_DARWIN
7710 
7711  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7712  if (atomic_available && (num_vars <= 3)) {
7713  retval = atomic_reduce_block;
7714  } else if (tree_available) {
7715  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7716  (reduce_size < (2000 * sizeof(kmp_real64)))) {
7717  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7718  }
7719  } // otherwise: use critical section
7720 
7721 #else
7722 #error "Unknown or unsupported OS"
7723 #endif
7724 
7725 #else
7726 #error "Unknown or unsupported architecture"
7727 #endif
7728  }
7729 
7730  // KMP_FORCE_REDUCTION
7731 
7732  // If the team is serialized (team_size == 1), ignore the forced reduction
7733  // method and stay with the unsynchronized method (empty_reduce_block)
7734  if (__kmp_force_reduction_method != reduction_method_not_defined &&
7735  team_size != 1) {
7736 
7737  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7738 
7739  int atomic_available, tree_available;
7740 
7741  switch ((forced_retval = __kmp_force_reduction_method)) {
7742  case critical_reduce_block:
7743  KMP_ASSERT(lck); // lck should be != 0
7744  break;
7745 
7746  case atomic_reduce_block:
7747  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7748  if (!atomic_available) {
7749  KMP_WARNING(RedMethodNotSupported, "atomic");
7750  forced_retval = critical_reduce_block;
7751  }
7752  break;
7753 
7754  case tree_reduce_block:
7755  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7756  if (!tree_available) {
7757  KMP_WARNING(RedMethodNotSupported, "tree");
7758  forced_retval = critical_reduce_block;
7759  } else {
7760 #if KMP_FAST_REDUCTION_BARRIER
7761  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7762 #endif
7763  }
7764  break;
7765 
7766  default:
7767  KMP_ASSERT(0); // "unsupported method specified"
7768  }
7769 
7770  retval = forced_retval;
7771  }
7772 
7773  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7774 
7775 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7776 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7777 
7778  return (retval);
7779 }
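
// Summary of the selection above: a serialized team (team_size == 1) always
// gets empty_reduce_block; on the 64-bit architectures a tree reduction with
// the reduction barrier is preferred once the team grows past the cutoff
// (4, or 8 on MIC), with atomics used for small teams when the compiler
// generated them; on the 32-bit architectures atomics are used only for a
// small number of reduction variables (with a tree fallback on Darwin for
// mid-sized reductions); everything else falls back to the critical-section
// method, and KMP_FORCE_REDUCTION can override the choice for non-serialized
// teams.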
7780 
7781 // this function is for testing set/get/determine reduce method
7782 kmp_int32 __kmp_get_reduce_method(void) {
7783  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7784 }
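
// Note: the packed value keeps the reduction method in the bits above the low
// byte (the low byte encodes the barrier flavor used by the tree methods),
// so the shift by 8 above strips the low byte and returns just the method tag
// for the test harness.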