1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 #include "kmp_dispatch.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 
36 /* these are temporary issues to be dealt with */
37 #define KMP_USE_PRCTL 0
38 
39 #if KMP_OS_WINDOWS
40 #include <process.h>
41 #endif
42 
43 #include "tsan_annotations.h"
44 
45 #if defined(KMP_GOMP_COMPAT)
46 char const __kmp_version_alt_comp[] =
47  KMP_VERSION_PREFIX "alternative compiler support: yes";
48 #endif /* defined(KMP_GOMP_COMPAT) */
49 
50 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51 #if OMP_50_ENABLED
52  "5.0 (201611)";
53 #elif OMP_45_ENABLED
54  "4.5 (201511)";
55 #elif OMP_40_ENABLED
56  "4.0 (201307)";
57 #else
58  "3.1 (201107)";
59 #endif
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63  KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79  int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81  kmp_internal_control_t *new_icvs,
82  ident_t *loc);
83 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85  int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91  kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* fast (and somewhat portable) way to get a unique identifier for the executing
107  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115 
116  KA_TRACE(
117  1000,
118  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
119  __kmp_nth, __kmp_all_nth));
120 
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124  __kmp_init_gtid for this to work. */
125 
126  if (!TCR_4(__kmp_init_gtid))
127  return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132  return __kmp_gtid;
133  }
134 #endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137  return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143 
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145  access to __kmp_threads array. For example:
146  1. Current thread loads other_threads[i] to thr and checks it, it is
147  non-NULL.
148  2. Current thread is suspended by OS.
149  3. Another thread unregisters and finishes (debug versions of free()
150  may fill memory with something like 0xEF).
151  4. Current thread is resumed.
152  5. Current thread reads junk from *thr.
153  TODO: Fix it. --ln */
154 
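  /* Fall back to a linear scan: the address of a local variable on our stack
     must lie within the stack extent registered for the executing thread, so
     the first matching entry gives our gtid. */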
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158  if (!thr)
159  continue;
160 
161  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164  /* stack grows down -- search through all of the active threads */
165 
166  if (stack_addr <= stack_base) {
167  size_t stack_diff = stack_base - stack_addr;
168 
169  if (stack_diff <= stack_size) {
170  /* The only way we can be closer than the allocated */
171  /* stack size is if we are running on this thread. */
172  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173  return i;
174  }
175  }
176  }
177 
178  /* use __kmp_gtid_get_specific() to try to determine our gtid */
179  KA_TRACE(1000,
180  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181  "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183 
184  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
185 
186  /* if we haven't been assigned a gtid, then return the error code */
187  if (i < 0)
188  return i;
189 
190  /* dynamically updated stack window for uber threads to avoid get_specific
191  call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193  KMP_FATAL(StackOverflow, i);
194  }
195 
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201  stack_base);
202  } else {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  stack_base - stack_addr);
205  }
206 
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212  other_threads[i]->th.th_info.ds.ds_stacksize,
213  "th_%d stack (refinement)", i);
214  }
215  return i;
216 }
217 
218 int __kmp_get_global_thread_id_reg() {
219  int gtid;
220 
221  if (!__kmp_init_serial) {
222  gtid = KMP_GTID_DNE;
223  } else
224 #ifdef KMP_TDATA_GTID
225  if (TCR_4(__kmp_gtid_mode) >= 3) {
226  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227  gtid = __kmp_gtid;
228  } else
229 #endif
230  if (TCR_4(__kmp_gtid_mode) >= 2) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232  gtid = __kmp_gtid_get_specific();
233  } else {
234  KA_TRACE(1000,
235  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236  gtid = __kmp_get_global_thread_id();
237  }
238 
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241  KA_TRACE(10,
242  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243  "Registering a new gtid.\n"));
244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245  if (!__kmp_init_serial) {
246  __kmp_do_serial_initialize();
247  gtid = __kmp_gtid_get_specific();
248  } else {
249  gtid = __kmp_register_root(FALSE);
250  }
251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254 
255  KMP_DEBUG_ASSERT(gtid >= 0);
256 
257  return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266 
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272  gtid = __kmp_gtid_from_thread(th);
273 
274  if (gtid == KMP_GTID_MONITOR) {
275  __kmp_print_storage_map_gtid(
276  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277  "th_%s stack (%s)", "mon",
278  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279  } else {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%d stack (%s)", gtid,
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  }
285  }
286 
287  /* No point in checking ubermaster threads since they use refinement and
288  * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291  KA_TRACE(10,
292  ("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if (stack_beg == NULL) {
294  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for (f = 0; f < __kmp_threads_capacity; f++) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if (f_th && f_th != th) {
302  char *other_stack_end =
303  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg =
305  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
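          /* Report an overlap if either end of our stack falls strictly inside
             the other thread's registered stack range. */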
306  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if (__kmp_storage_map)
311  __kmp_print_storage_map_gtid(
312  -1, other_stack_beg, other_stack_end,
313  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317  __kmp_msg_null);
318  }
319  }
320  }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329 
330  while (!done) {
331  KMP_YIELD(1);
332  }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341 
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344  p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351  if (__kmp_storage_map_verbose) {
352  node = __kmp_get_host_node(p1);
353  if (node < 0) /* doesn't work, so don't try this next time */
354  __kmp_storage_map_verbose = FALSE;
355  else {
356  char *last;
357  int lastNode;
358  int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360  const int page_size = KMP_GET_PAGE_SIZE();
361 
362  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364  if (localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
366  localProc >> 1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370  /* The more elaborate format is disabled for now because of the prctl
371  * hanging bug. */
372  do {
373  last = p1;
374  lastNode = node;
375  /* This loop collates adjacent pages with the same host node. */
376  do {
377  p1 = (char *)p1 + page_size;
378  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
380  lastNode);
381  } while (p1 <= p2);
382 #else
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
384  (char *)p1 + (page_size - 1),
385  __kmp_get_host_node(p1));
386  if (p1 < p2) {
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
388  (char *)p2 + (page_size - 1),
389  __kmp_get_host_node(p2));
390  }
391 #endif
392  }
393  }
394  } else
395  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
401 void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404 
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406  return;
407  }
408 
409  va_start(ap, format);
410 
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416  va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423  if (__kmp_debug_buf) {
424  __kmp_dump_debug_buffer();
425  }
426 
427  if (KMP_OS_WINDOWS) {
428  // Let other threads know of abnormal termination and prevent deadlock
429  // if abort happened during library initialization or shutdown
430  __kmp_global.g.g_abort = SIGABRT;
431 
432  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
433  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
434  boxes. _set_abort_behavior() works well, but this function is not
435  available in VS7 (this is not a problem for the DLL, but it is a problem
436  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
437  does not help, at least in some versions of the MS C RTL.
438 
439  It seems the following sequence is the only way to simulate abort() and
440  avoid the pop-up error box. */
441  raise(SIGABRT);
442  _exit(3); // Just in case, if signal ignored, exit anyway.
443  } else {
444  abort();
445  }
446 
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort just call abort(), it will kill all the threads.
455  __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459  that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463  gtid);
464 
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469  sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471  __kmp_print_storage_map_gtid(
472  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier + 1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478  gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481  &thr->th.th_bar[bs_forkjoin_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483  gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487  &thr->th.th_bar[bs_reduction_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489  gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494  that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497  int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id);
501 
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503  &team->t.t_bar[bs_last_barrier],
504  sizeof(kmp_balign_team_t) * bs_last_barrier,
505  "%s_%d.t_bar", header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508  &team->t.t_bar[bs_plain_barrier + 1],
509  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513  &team->t.t_bar[bs_forkjoin_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519  &team->t.t_bar[bs_reduction_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528  __kmp_print_storage_map_gtid(
529  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533  &team->t.t_disp_buffer[num_disp_buff],
534  sizeof(dispatch_shared_info_t) * num_disp_buff,
535  "%s_%d.t_disp_buffer", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539  team_id);
540 }
541 
542 static void __kmp_init_allocator() {
543 #if OMP_50_ENABLED
544  __kmp_init_memkind();
545 #endif
546 }
547 static void __kmp_fini_allocator() {
548 #if OMP_50_ENABLED
549  __kmp_fini_memkind();
550 #endif
551 }
552 
553 /* ------------------------------------------------------------------------ */
554 
555 #if KMP_DYNAMIC_LIB
556 #if KMP_OS_WINDOWS
557 
558 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
559  // TODO: Change to __kmp_break_bootstrap_lock().
560  __kmp_init_bootstrap_lock(lck); // make the lock released
561 }
562 
563 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
564  int i;
565  int thread_count;
566 
567  // PROCESS_DETACH is expected to be called by a thread that executes
568  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
569  // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
570  // __kmp_threads[] without taking the forkjoin_lock. However, some threads can
571  // still be alive here, although they are about to be terminated. The entries
572  // in the array with ds_thread==0 are the most suspicious. So it may actually
573  // be unsafe to access __kmp_threads[].
574 
575  // TODO: does it make sense to check __kmp_roots[] ?
576 
577  // Let's check that there are no other alive threads registered with the OMP
578  // lib.
579  while (1) {
580  thread_count = 0;
581  for (i = 0; i < __kmp_threads_capacity; ++i) {
582  if (!__kmp_threads)
583  continue;
584  kmp_info_t *th = __kmp_threads[i];
585  if (th == NULL)
586  continue;
587  int gtid = th->th.th_info.ds.ds_gtid;
588  if (gtid == gtid_req)
589  continue;
590  if (gtid < 0)
591  continue;
592  DWORD exit_val;
593  int alive = __kmp_is_thread_alive(th, &exit_val);
594  if (alive) {
595  ++thread_count;
596  }
597  }
598  if (thread_count == 0)
599  break; // success
600  }
601 
602  // Assume that I'm alone. Now it might be safe to check and reset locks.
603  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
604  __kmp_reset_lock(&__kmp_forkjoin_lock);
605 #ifdef KMP_DEBUG
606  __kmp_reset_lock(&__kmp_stdio_lock);
607 #endif // KMP_DEBUG
608 }
609 
610 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
611  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
612 
613  switch (fdwReason) {
614 
615  case DLL_PROCESS_ATTACH:
616  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
617 
618  return TRUE;
619 
620  case DLL_PROCESS_DETACH:
621  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
622 
623  if (lpReserved != NULL) {
624  // lpReserved is used for telling the difference:
625  // lpReserved == NULL when FreeLibrary() was called,
626  // lpReserved != NULL when the process terminates.
627  // When FreeLibrary() is called, worker threads remain alive. So they will
628  // release the forkjoin lock by themselves. When the process terminates,
629  // worker threads disappear triggering the problem of unreleased forkjoin
630  // lock as described below.
631 
632  // A worker thread can take the forkjoin lock. The problem comes up if
633  // that worker thread becomes dead before it releases the forkjoin lock.
634  // The forkjoin lock remains taken, while the thread executing
635  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
636  // to take the forkjoin lock and will always fail, so that the application
637  // will never finish [normally]. This scenario is possible if
638  // __kmpc_end() has not been executed. It looks like it's not a corner
639  // case, but common cases:
640  // - the main function was compiled by an alternative compiler;
641  // - the main function was compiled by icl but without /Qopenmp
642  // (application with plugins);
643  // - application terminates by calling C exit(), Fortran CALL EXIT() or
644  // Fortran STOP.
645  // - alive foreign thread prevented __kmpc_end from doing cleanup.
646  //
647  // This is a hack to work around the problem.
648  // TODO: !!! figure out something better.
649  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
650  }
651 
652  __kmp_internal_end_library(__kmp_gtid_get_specific());
653 
654  return TRUE;
655 
656  case DLL_THREAD_ATTACH:
657  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
658 
659  /* if we want to register new sibling threads every time, call
660  * __kmp_get_gtid() here */
661  return TRUE;
662 
663  case DLL_THREAD_DETACH:
664  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
665 
666  __kmp_internal_end_thread(__kmp_gtid_get_specific());
667  return TRUE;
668  }
669 
670  return TRUE;
671 }
672 
673 #endif /* KMP_OS_WINDOWS */
674 #endif /* KMP_DYNAMIC_LIB */
675 
676 /* Change the library type to "status" and return the old type */
677 /* called from within initialization routines where __kmp_initz_lock is held */
678 int __kmp_change_library(int status) {
679  int old_status;
680 
681  old_status = __kmp_yield_init &
682  1; // check whether KMP_LIBRARY=throughput (even init count)
683 
684  if (status) {
685  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
686  } else {
687  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
688  }
689 
690  return old_status; // return previous setting of whether
691  // KMP_LIBRARY=throughput
692 }
693 
694 /* __kmp_parallel_deo -- Wait until it's our turn. */
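/* Each thread spins until the team's ordered ticket (t_ordered.dt.t_value)
   equals its own tid; __kmp_parallel_dxo below then passes the ticket to the
   next tid in the team. */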
695 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698  kmp_team_t *team = __kmp_team_from_gtid(gtid);
699 #endif /* BUILD_PARALLEL_ORDERED */
700 
701  if (__kmp_env_consistency_check) {
702  if (__kmp_threads[gtid]->th.th_root->r.r_active)
703 #if KMP_USE_DYNAMIC_LOCK
704  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
705 #else
706  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
707 #endif
708  }
709 #ifdef BUILD_PARALLEL_ORDERED
710  if (!team->t.t_serialized) {
711  KMP_MB();
712  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
713  KMP_EQ, NULL);
714  KMP_MB();
715  }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* __kmp_parallel_dxo -- Signal the next task. */
720 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
721  int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723  int tid = __kmp_tid_from_gtid(gtid);
724  kmp_team_t *team = __kmp_team_from_gtid(gtid);
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727  if (__kmp_env_consistency_check) {
728  if (__kmp_threads[gtid]->th.th_root->r.r_active)
729  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
730  }
731 #ifdef BUILD_PARALLEL_ORDERED
732  if (!team->t.t_serialized) {
733  KMP_MB(); /* Flush all pending memory write invalidates. */
734 
735  /* use the tid of the next thread in this team */
736  /* TODO replace with general release procedure */
737  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
738 
739  KMP_MB(); /* Flush all pending memory write invalidates. */
740  }
741 #endif /* BUILD_PARALLEL_ORDERED */
742 }
743 
744 /* ------------------------------------------------------------------------ */
745 /* The BARRIER for a SINGLE process section is always explicit */
746 
747 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
748  int status;
749  kmp_info_t *th;
750  kmp_team_t *team;
751 
752  if (!TCR_4(__kmp_init_parallel))
753  __kmp_parallel_initialize();
754 
755 #if OMP_50_ENABLED
756  __kmp_resume_if_soft_paused();
757 #endif
758 
759  th = __kmp_threads[gtid];
760  team = th->th.th_team;
761  status = 0;
762 
763  th->th.th_ident = id_ref;
764 
765  if (team->t.t_serialized) {
766  status = 1;
767  } else {
768  kmp_int32 old_this = th->th.th_local.this_construct;
769 
770  ++th->th.th_local.this_construct;
771  /* try to set team count to thread count--success means thread got the
772  single block */
773  /* TODO: Should this be acquire or release? */
774  if (team->t.t_construct == old_this) {
775  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
776  th->th.th_local.this_construct);
777  }
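    // The thread whose compare-and-store succeeds wins the single region; the
    // other threads see the updated t_construct and skip the block.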
778 #if USE_ITT_BUILD
779  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
780  KMP_MASTER_GTID(gtid) &&
781 #if OMP_40_ENABLED
782  th->th.th_teams_microtask == NULL &&
783 #endif
784  team->t.t_active_level ==
785  1) { // Only report metadata by master of active team at level 1
786  __kmp_itt_metadata_single(id_ref);
787  }
788 #endif /* USE_ITT_BUILD */
789  }
790 
791  if (__kmp_env_consistency_check) {
792  if (status && push_ws) {
793  __kmp_push_workshare(gtid, ct_psingle, id_ref);
794  } else {
795  __kmp_check_workshare(gtid, ct_psingle, id_ref);
796  }
797  }
798 #if USE_ITT_BUILD
799  if (status) {
800  __kmp_itt_single_start(gtid);
801  }
802 #endif /* USE_ITT_BUILD */
803  return status;
804 }
805 
806 void __kmp_exit_single(int gtid) {
807 #if USE_ITT_BUILD
808  __kmp_itt_single_end(gtid);
809 #endif /* USE_ITT_BUILD */
810  if (__kmp_env_consistency_check)
811  __kmp_pop_workshare(gtid, ct_psingle, NULL);
812 }
813 
814 /* Determine if we can go parallel or must use a serialized parallel region,
815  * and how many threads we can use.
816  * set_nthreads is the number of threads requested for the team.
817  * Returns 1 if we should serialize or only use one thread,
818  * otherwise the number of threads to use.
819  * The forkjoin lock is held by the caller. */
820 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
821  int master_tid, int set_nthreads
822 #if OMP_40_ENABLED
823  ,
824  int enter_teams
825 #endif /* OMP_40_ENABLED */
826  ) {
827  int capacity;
828  int new_nthreads;
829  KMP_DEBUG_ASSERT(__kmp_init_serial);
830  KMP_DEBUG_ASSERT(root && parent_team);
831 
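  // The requested thread count is trimmed in stages: first by the dynamic
  // adjustment mode selected below, then by KMP_DEVICE_THREAD_LIMIT, then by
  // OMP_THREAD_LIMIT, and finally by the remaining capacity of the
  // __kmp_threads array.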
832  // If dyn-var is set, dynamically adjust the number of desired threads,
833  // according to the method specified by dynamic_mode.
834  new_nthreads = set_nthreads;
835  if (!get__dynamic_2(parent_team, master_tid)) {
836  ;
837  }
838 #ifdef USE_LOAD_BALANCE
839  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
840  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
841  if (new_nthreads == 1) {
842  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
843  "reservation to 1 thread\n",
844  master_tid));
845  return 1;
846  }
847  if (new_nthreads < set_nthreads) {
848  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
849  "reservation to %d threads\n",
850  master_tid, new_nthreads));
851  }
852  }
853 #endif /* USE_LOAD_BALANCE */
854  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
855  new_nthreads = __kmp_avail_proc - __kmp_nth +
856  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857  if (new_nthreads <= 1) {
858  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
859  "reservation to 1 thread\n",
860  master_tid));
861  return 1;
862  }
863  if (new_nthreads < set_nthreads) {
864  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
865  "reservation to %d threads\n",
866  master_tid, new_nthreads));
867  } else {
868  new_nthreads = set_nthreads;
869  }
870  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
871  if (set_nthreads > 2) {
872  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
873  new_nthreads = (new_nthreads % set_nthreads) + 1;
874  if (new_nthreads == 1) {
875  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
876  "reservation to 1 thread\n",
877  master_tid));
878  return 1;
879  }
880  if (new_nthreads < set_nthreads) {
881  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
882  "reservation to %d threads\n",
883  master_tid, new_nthreads));
884  }
885  }
886  } else {
887  KMP_ASSERT(0);
888  }
889 
890  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
891  if (__kmp_nth + new_nthreads -
892  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893  __kmp_max_nth) {
894  int tl_nthreads = __kmp_max_nth - __kmp_nth +
895  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
896  if (tl_nthreads <= 0) {
897  tl_nthreads = 1;
898  }
899 
900  // If dyn-var is false, emit a 1-time warning.
901  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
902  __kmp_reserve_warn = 1;
903  __kmp_msg(kmp_ms_warning,
904  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
905  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
906  }
907  if (tl_nthreads == 1) {
908  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
909  "reduced reservation to 1 thread\n",
910  master_tid));
911  return 1;
912  }
913  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
914  "reservation to %d threads\n",
915  master_tid, tl_nthreads));
916  new_nthreads = tl_nthreads;
917  }
918 
919  // Respect OMP_THREAD_LIMIT
920  if (root->r.r_cg_nthreads + new_nthreads -
921  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
922  __kmp_cg_max_nth) {
923  int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
924  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
925  if (tl_nthreads <= 0) {
926  tl_nthreads = 1;
927  }
928 
929  // If dyn-var is false, emit a 1-time warning.
930  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
931  __kmp_reserve_warn = 1;
932  __kmp_msg(kmp_ms_warning,
933  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
934  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
935  }
936  if (tl_nthreads == 1) {
937  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
938  "reduced reservation to 1 thread\n",
939  master_tid));
940  return 1;
941  }
942  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
943  "reservation to %d threads\n",
944  master_tid, tl_nthreads));
945  new_nthreads = tl_nthreads;
946  }
947 
948  // Check if the threads array is large enough, or needs expanding.
949  // See comment in __kmp_register_root() about the adjustment if
950  // __kmp_threads[0] == NULL.
951  capacity = __kmp_threads_capacity;
952  if (TCR_PTR(__kmp_threads[0]) == NULL) {
953  --capacity;
954  }
955  if (__kmp_nth + new_nthreads -
956  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
957  capacity) {
958  // Expand the threads array.
959  int slotsRequired = __kmp_nth + new_nthreads -
960  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
961  capacity;
962  int slotsAdded = __kmp_expand_threads(slotsRequired);
963  if (slotsAdded < slotsRequired) {
964  // The threads array was not expanded enough.
965  new_nthreads -= (slotsRequired - slotsAdded);
966  KMP_ASSERT(new_nthreads >= 1);
967 
968  // If dyn-var is false, emit a 1-time warning.
969  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
970  __kmp_reserve_warn = 1;
971  if (__kmp_tp_cached) {
972  __kmp_msg(kmp_ms_warning,
973  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
974  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
975  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
976  } else {
977  __kmp_msg(kmp_ms_warning,
978  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
979  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
980  }
981  }
982  }
983  }
984 
985 #ifdef KMP_DEBUG
986  if (new_nthreads == 1) {
987  KC_TRACE(10,
988  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
989  "dead roots and rechecking; requested %d threads\n",
990  __kmp_get_gtid(), set_nthreads));
991  } else {
992  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
993  " %d threads\n",
994  __kmp_get_gtid(), new_nthreads, set_nthreads));
995  }
996 #endif // KMP_DEBUG
997  return new_nthreads;
998 }
999 
1000 /* Allocate threads from the thread pool and assign them to the new team. We are
1001  assured that there are enough threads available, because we checked that
1002  earlier while holding the forkjoin lock. */
1003 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1004  kmp_info_t *master_th, int master_gtid) {
1005  int i;
1006  int use_hot_team;
1007 
1008  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1009  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1010  KMP_MB();
1011 
1012  /* first, let's setup the master thread */
1013  master_th->th.th_info.ds.ds_tid = 0;
1014  master_th->th.th_team = team;
1015  master_th->th.th_team_nproc = team->t.t_nproc;
1016  master_th->th.th_team_master = master_th;
1017  master_th->th.th_team_serialized = FALSE;
1018  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1019 
1020 /* make sure we are not the optimized hot team */
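/* A hot team keeps its worker threads between parallel regions, so if this
   team is the hot team for its level the workers are already in place and do
   not need to be forked again. */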
1021 #if KMP_NESTED_HOT_TEAMS
1022  use_hot_team = 0;
1023  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1024  if (hot_teams) { // hot teams array is not allocated if
1025  // KMP_HOT_TEAMS_MAX_LEVEL=0
1026  int level = team->t.t_active_level - 1; // index in array of hot teams
1027  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1028  if (master_th->th.th_teams_size.nteams > 1) {
1029  ++level; // level was not increased in teams construct for
1030  // team_of_masters
1031  }
1032  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1033  master_th->th.th_teams_level == team->t.t_level) {
1034  ++level; // level was not increased in teams construct for
1035  // team_of_workers before the parallel
1036  } // team->t.t_level will be increased inside parallel
1037  }
1038  if (level < __kmp_hot_teams_max_level) {
1039  if (hot_teams[level].hot_team) {
1040  // hot team has already been allocated for given level
1041  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1042  use_hot_team = 1; // the team is ready to use
1043  } else {
1044  use_hot_team = 0; // AC: threads are not allocated yet
1045  hot_teams[level].hot_team = team; // remember new hot team
1046  hot_teams[level].hot_team_nth = team->t.t_nproc;
1047  }
1048  } else {
1049  use_hot_team = 0;
1050  }
1051  }
1052 #else
1053  use_hot_team = team == root->r.r_hot_team;
1054 #endif
1055  if (!use_hot_team) {
1056 
1057  /* install the master thread */
1058  team->t.t_threads[0] = master_th;
1059  __kmp_initialize_info(master_th, team, 0, master_gtid);
1060 
1061  /* now, install the worker threads */
1062  for (i = 1; i < team->t.t_nproc; i++) {
1063 
1064  /* fork or reallocate a new thread and install it in team */
1065  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1066  team->t.t_threads[i] = thr;
1067  KMP_DEBUG_ASSERT(thr);
1068  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1069  /* align team and thread arrived states */
1070  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1071  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1072  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1073  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1074  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1075  team->t.t_bar[bs_plain_barrier].b_arrived));
1076 #if OMP_40_ENABLED
1077  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1078  thr->th.th_teams_level = master_th->th.th_teams_level;
1079  thr->th.th_teams_size = master_th->th.th_teams_size;
1080 #endif
1081  { // Initialize threads' barrier data.
1082  int b;
1083  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1084  for (b = 0; b < bs_last_barrier; ++b) {
1085  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1086  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1087 #if USE_DEBUGGER
1088  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1089 #endif
1090  }
1091  }
1092  }
1093 
1094 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1095  __kmp_partition_places(team);
1096 #endif
1097  }
1098 
1099 #if OMP_50_ENABLED
1100  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1101  for (i = 0; i < team->t.t_nproc; i++) {
1102  kmp_info_t *thr = team->t.t_threads[i];
1103  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1104  thr->th.th_prev_level != team->t.t_level) {
1105  team->t.t_display_affinity = 1;
1106  break;
1107  }
1108  }
1109  }
1110 #endif
1111 
1112  KMP_MB();
1113 }
1114 
1115 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1116 // Propagate any changes to the floating point control registers out to the team
1117 // We try to avoid unnecessary writes to the relevant cache line in the team
1118 // structure, so we don't make changes unless they are needed.
1119 inline static void propagateFPControl(kmp_team_t *team) {
1120  if (__kmp_inherit_fp_control) {
1121  kmp_int16 x87_fpu_control_word;
1122  kmp_uint32 mxcsr;
1123 
1124  // Get master values of FPU control flags (both X87 and vector)
1125  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1126  __kmp_store_mxcsr(&mxcsr);
1127  mxcsr &= KMP_X86_MXCSR_MASK;
1128 
1129  // There is no point looking at t_fp_control_saved here.
1130  // If it is TRUE, we still have to update the values if they are different
1131  // from those we now have. If it is FALSE we didn't save anything yet, but
1132  // our objective is the same. We have to ensure that the values in the team
1133  // are the same as those we have.
1134  // So, this code achieves what we need whether or not t_fp_control_saved is
1135  // true. By checking whether the value needs updating we avoid unnecessary
1136  // writes that would put the cache-line into a written state, causing all
1137  // threads in the team to have to read it again.
1138  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1139  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1140  // Although we don't use this value, other code in the runtime wants to know
1141  // whether it should restore them. So we must ensure it is correct.
1142  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1143  } else {
1144  // Similarly here. Don't write to this cache-line in the team structure
1145  // unless we have to.
1146  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1147  }
1148 }
1149 
1150 // Do the opposite, setting the hardware registers to the updated values from
1151 // the team.
1152 inline static void updateHWFPControl(kmp_team_t *team) {
1153  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1154  // Only reset the fp control regs if they have been changed in the team
1155  // during the parallel region that we are exiting.
1156  kmp_int16 x87_fpu_control_word;
1157  kmp_uint32 mxcsr;
1158  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1159  __kmp_store_mxcsr(&mxcsr);
1160  mxcsr &= KMP_X86_MXCSR_MASK;
1161 
1162  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1163  __kmp_clear_x87_fpu_status_word();
1164  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1165  }
1166 
1167  if (team->t.t_mxcsr != mxcsr) {
1168  __kmp_load_mxcsr(&team->t.t_mxcsr);
1169  }
1170  }
1171 }
1172 #else
1173 #define propagateFPControl(x) ((void)0)
1174 #define updateHWFPControl(x) ((void)0)
1175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1176 
1177 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1178  int realloc); // forward declaration
1179 
1180 /* Run a parallel region that has been serialized, so it runs only in a team of
1181  the single master thread. */
1182 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1183  kmp_info_t *this_thr;
1184  kmp_team_t *serial_team;
1185 
1186  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1187 
1188  /* Skip all this code for autopar serialized loops since it results in
1189  unacceptable overhead */
1190  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1191  return;
1192 
1193  if (!TCR_4(__kmp_init_parallel))
1194  __kmp_parallel_initialize();
1195 
1196 #if OMP_50_ENABLED
1197  __kmp_resume_if_soft_paused();
1198 #endif
1199 
1200  this_thr = __kmp_threads[global_tid];
1201  serial_team = this_thr->th.th_serial_team;
1202 
1203  /* utilize the serialized team held by this thread */
1204  KMP_DEBUG_ASSERT(serial_team);
1205  KMP_MB();
1206 
1207  if (__kmp_tasking_mode != tskm_immediate_exec) {
1208  KMP_DEBUG_ASSERT(
1209  this_thr->th.th_task_team ==
1210  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1211  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1212  NULL);
1213  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1214  "team %p, new task_team = NULL\n",
1215  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1216  this_thr->th.th_task_team = NULL;
1217  }
1218 
1219 #if OMP_40_ENABLED
1220  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1221  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1222  proc_bind = proc_bind_false;
1223  } else if (proc_bind == proc_bind_default) {
1224  // No proc_bind clause was specified, so use the current value
1225  // of proc-bind-var for this parallel region.
1226  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1227  }
1228  // Reset for next parallel region
1229  this_thr->th.th_set_proc_bind = proc_bind_default;
1230 #endif /* OMP_40_ENABLED */
1231 
1232 #if OMPT_SUPPORT
1233  ompt_data_t ompt_parallel_data = ompt_data_none;
1234  ompt_data_t *implicit_task_data;
1235  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1236  if (ompt_enabled.enabled &&
1237  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1238 
1239  ompt_task_info_t *parent_task_info;
1240  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1241 
1242  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1243  if (ompt_enabled.ompt_callback_parallel_begin) {
1244  int team_size = 1;
1245 
1246  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1247  &(parent_task_info->task_data), &(parent_task_info->frame),
1248  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1249  codeptr);
1250  }
1251  }
1252 #endif // OMPT_SUPPORT
1253 
1254  if (this_thr->th.th_team != serial_team) {
1255  // Nested level will be an index in the nested nthreads array
1256  int level = this_thr->th.th_team->t.t_level;
1257 
1258  if (serial_team->t.t_serialized) {
1259  /* this serial team was already used
1260  TODO: increase performance by making these locks more specific */
1261  kmp_team_t *new_team;
1262 
1263  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1264 
1265  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1266 #if OMPT_SUPPORT
1267  ompt_parallel_data,
1268 #endif
1269 #if OMP_40_ENABLED
1270  proc_bind,
1271 #endif
1272  &this_thr->th.th_current_task->td_icvs,
1273  0 USE_NESTED_HOT_ARG(NULL));
1274  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1275  KMP_ASSERT(new_team);
1276 
1277  /* setup new serialized team and install it */
1278  new_team->t.t_threads[0] = this_thr;
1279  new_team->t.t_parent = this_thr->th.th_team;
1280  serial_team = new_team;
1281  this_thr->th.th_serial_team = serial_team;
1282 
1283  KF_TRACE(
1284  10,
1285  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1286  global_tid, serial_team));
1287 
1288  /* TODO the above breaks the requirement that if we run out of resources,
1289  then we can still guarantee that serialized teams are ok, since we may
1290  need to allocate a new one */
1291  } else {
1292  KF_TRACE(
1293  10,
1294  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1295  global_tid, serial_team));
1296  }
1297 
1298  /* we have to initialize this serial team */
1299  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1300  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1301  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1302  serial_team->t.t_ident = loc;
1303  serial_team->t.t_serialized = 1;
1304  serial_team->t.t_nproc = 1;
1305  serial_team->t.t_parent = this_thr->th.th_team;
1306  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1307  this_thr->th.th_team = serial_team;
1308  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1309 
1310  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1311  this_thr->th.th_current_task));
1312  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1313  this_thr->th.th_current_task->td_flags.executing = 0;
1314 
1315  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1316 
1317  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1318  implicit task for each serialized task represented by
1319  team->t.t_serialized? */
1320  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1321  &this_thr->th.th_current_task->td_parent->td_icvs);
1322 
1323  // Thread value exists in the nested nthreads array for the next nested
1324  // level
1325  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1326  this_thr->th.th_current_task->td_icvs.nproc =
1327  __kmp_nested_nth.nth[level + 1];
1328  }
1329 
1330 #if OMP_40_ENABLED
1331  if (__kmp_nested_proc_bind.used &&
1332  (level + 1 < __kmp_nested_proc_bind.used)) {
1333  this_thr->th.th_current_task->td_icvs.proc_bind =
1334  __kmp_nested_proc_bind.bind_types[level + 1];
1335  }
1336 #endif /* OMP_40_ENABLED */
1337 
1338 #if USE_DEBUGGER
1339  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1340 #endif
1341  this_thr->th.th_info.ds.ds_tid = 0;
1342 
1343  /* set thread cache values */
1344  this_thr->th.th_team_nproc = 1;
1345  this_thr->th.th_team_master = this_thr;
1346  this_thr->th.th_team_serialized = 1;
1347 
1348  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1349  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1350 #if OMP_50_ENABLED
1351  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1352 #endif
1353 
1354  propagateFPControl(serial_team);
1355 
1356  /* check if we need to allocate dispatch buffers stack */
1357  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1358  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1359  serial_team->t.t_dispatch->th_disp_buffer =
1360  (dispatch_private_info_t *)__kmp_allocate(
1361  sizeof(dispatch_private_info_t));
1362  }
1363  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1364 
1365  KMP_MB();
1366 
1367  } else {
1368  /* this serialized team is already being used,
1369  * that's fine, just add another nested level */
1370  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1371  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1372  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1373  ++serial_team->t.t_serialized;
1374  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1375 
1376  // Nested level will be an index in the nested nthreads array
1377  int level = this_thr->th.th_team->t.t_level;
1378  // Thread value exists in the nested nthreads array for the next nested
1379  // level
1380  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1381  this_thr->th.th_current_task->td_icvs.nproc =
1382  __kmp_nested_nth.nth[level + 1];
1383  }
1384  serial_team->t.t_level++;
1385  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1386  "of serial team %p to %d\n",
1387  global_tid, serial_team, serial_team->t.t_level));
1388 
1389  /* allocate/push dispatch buffers stack */
1390  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1391  {
1392  dispatch_private_info_t *disp_buffer =
1393  (dispatch_private_info_t *)__kmp_allocate(
1394  sizeof(dispatch_private_info_t));
1395  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1396  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1397  }
1398  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1399 
1400  KMP_MB();
1401  }
1402 #if OMP_40_ENABLED
1403  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1404 #endif
1405 
1406 #if OMP_50_ENABLED
1407  // Perform the display affinity functionality for
1408  // serialized parallel regions
1409  if (__kmp_display_affinity) {
1410  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1411  this_thr->th.th_prev_num_threads != 1) {
1412  // NULL means use the affinity-format-var ICV
1413  __kmp_aux_display_affinity(global_tid, NULL);
1414  this_thr->th.th_prev_level = serial_team->t.t_level;
1415  this_thr->th.th_prev_num_threads = 1;
1416  }
1417  }
1418 #endif
1419 
1420  if (__kmp_env_consistency_check)
1421  __kmp_push_parallel(global_tid, NULL);
1422 #if OMPT_SUPPORT
1423  serial_team->t.ompt_team_info.master_return_address = codeptr;
1424  if (ompt_enabled.enabled &&
1425  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1426  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1427 
1428  ompt_lw_taskteam_t lw_taskteam;
1429  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1430  &ompt_parallel_data, codeptr);
1431 
1432  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1433  // don't use lw_taskteam after linking. content was swapped
1434 
1435  /* OMPT implicit task begin */
1436  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1437  if (ompt_enabled.ompt_callback_implicit_task) {
1438  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1439  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1440  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1441  OMPT_CUR_TASK_INFO(this_thr)
1442  ->thread_num = __kmp_tid_from_gtid(global_tid);
1443  }
1444 
1445  /* OMPT state */
1446  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1447  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1448  }
1449 #endif
1450 }
1451 
1452 /* most of the work for a fork */
1453 /* return true if we really went parallel, false if serialized */
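/* Typically reached through the compiler-generated entry point (for example,
   __kmpc_fork_call emitted for a #pragma omp parallel), which passes the
   outlined parallel body as the microtask argument. */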
1454 int __kmp_fork_call(ident_t *loc, int gtid,
1455  enum fork_context_e call_context, // Intel, GNU, ...
1456  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1457 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1458 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1459  va_list *ap
1460 #else
1461  va_list ap
1462 #endif
1463  ) {
1464  void **argv;
1465  int i;
1466  int master_tid;
1467  int master_this_cons;
1468  kmp_team_t *team;
1469  kmp_team_t *parent_team;
1470  kmp_info_t *master_th;
1471  kmp_root_t *root;
1472  int nthreads;
1473  int master_active;
1474  int master_set_numthreads;
1475  int level;
1476 #if OMP_40_ENABLED
1477  int active_level;
1478  int teams_level;
1479 #endif
1480 #if KMP_NESTED_HOT_TEAMS
1481  kmp_hot_team_ptr_t **p_hot_teams;
1482 #endif
1483  { // KMP_TIME_BLOCK
1484  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1485  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1486 
1487  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1488  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1489  /* Some systems prefer the stack for the root thread(s) to start with some
1490  gap from the parent stack to prevent false sharing. */
1491  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1492  /* These 2 lines below are so this does not get optimized out */
1493  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1494  __kmp_stkpadding += (short)((kmp_int64)dummy);
1495  }
1496 
1497  /* initialize if needed */
1498  KMP_DEBUG_ASSERT(
1499  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1500  if (!TCR_4(__kmp_init_parallel))
1501  __kmp_parallel_initialize();
1502 
1503 #if OMP_50_ENABLED
1504  __kmp_resume_if_soft_paused();
1505 #endif
1506 
1507  /* setup current data */
1508  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1509  // shutdown
1510  parent_team = master_th->th.th_team;
1511  master_tid = master_th->th.th_info.ds.ds_tid;
1512  master_this_cons = master_th->th.th_local.this_construct;
1513  root = master_th->th.th_root;
1514  master_active = root->r.r_active;
1515  master_set_numthreads = master_th->th.th_set_nproc;
1516 
1517 #if OMPT_SUPPORT
1518  ompt_data_t ompt_parallel_data = ompt_data_none;
1519  ompt_data_t *parent_task_data;
1520  ompt_frame_t *ompt_frame;
1521  ompt_data_t *implicit_task_data;
1522  void *return_address = NULL;
1523 
1524  if (ompt_enabled.enabled) {
1525  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1526  NULL, NULL);
1527  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1528  }
1529 #endif
1530 
1531  // Nested level will be an index in the nested nthreads array
1532  level = parent_team->t.t_level;
1533  // used to launch non-serial teams even if nested is not allowed
1534  active_level = parent_team->t.t_active_level;
1535 #if OMP_40_ENABLED
1536  // needed to check nesting inside the teams
1537  teams_level = master_th->th.th_teams_level;
1538 #endif
1539 #if KMP_NESTED_HOT_TEAMS
1540  p_hot_teams = &master_th->th.th_hot_teams;
1541  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1542  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1543  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1544  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1545  // it is either actual or not needed (when active_level > 0)
1546  (*p_hot_teams)[0].hot_team_nth = 1;
1547  }
1548 #endif
1549 
1550 #if OMPT_SUPPORT
1551  if (ompt_enabled.enabled) {
1552  if (ompt_enabled.ompt_callback_parallel_begin) {
1553  int team_size = master_set_numthreads
1554  ? master_set_numthreads
1555  : get__nproc_2(parent_team, master_tid);
1556  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1557  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1558  OMPT_INVOKER(call_context), return_address);
1559  }
1560  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1561  }
1562 #endif
1563 
1564  master_th->th.th_ident = loc;
1565 
1566 #if OMP_40_ENABLED
1567  if (master_th->th.th_teams_microtask && ap &&
1568  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1569  // AC: This is the start of a parallel region nested inside a teams construct.
1570  // The team is actual (hot); all workers are ready at the fork barrier.
1571  // No lock is needed to initialize the team a bit, then free the workers.
1572  parent_team->t.t_ident = loc;
1573  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1574  parent_team->t.t_argc = argc;
1575  argv = (void **)parent_team->t.t_argv;
1576  for (i = argc - 1; i >= 0; --i)
1577 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1578 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1579  *argv++ = va_arg(*ap, void *);
1580 #else
1581  *argv++ = va_arg(ap, void *);
1582 #endif
1583  // Increment our nested depth level, but do not increase the serialization
1584  if (parent_team == master_th->th.th_serial_team) {
1585  // AC: we are in serialized parallel
1586  __kmpc_serialized_parallel(loc, gtid);
1587  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1588  // AC: need this so that enquiry functions work
1589  // correctly; will restore at join time
1590  parent_team->t.t_serialized--;
1591 #if OMPT_SUPPORT
1592  void *dummy;
1593  void **exit_runtime_p;
1594 
1595  ompt_lw_taskteam_t lw_taskteam;
1596 
1597  if (ompt_enabled.enabled) {
1598  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1599  &ompt_parallel_data, return_address);
1600  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1601 
1602  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1603  // don't use lw_taskteam after linking; its content was swapped
1604 
1605  /* OMPT implicit task begin */
1606  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1607  if (ompt_enabled.ompt_callback_implicit_task) {
1608  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1609  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1610  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1611  OMPT_CUR_TASK_INFO(master_th)
1612  ->thread_num = __kmp_tid_from_gtid(gtid);
1613  }
1614 
1615  /* OMPT state */
1616  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1617  } else {
1618  exit_runtime_p = &dummy;
1619  }
1620 #endif
1621 
1622  {
1623  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1624  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1625  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1626 #if OMPT_SUPPORT
1627  ,
1628  exit_runtime_p
1629 #endif
1630  );
1631  }
1632 
1633 #if OMPT_SUPPORT
1634  *exit_runtime_p = NULL;
1635  if (ompt_enabled.enabled) {
1636  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1637  if (ompt_enabled.ompt_callback_implicit_task) {
1638  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1639  ompt_scope_end, NULL, implicit_task_data, 1,
1640  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1641  }
1642  __ompt_lw_taskteam_unlink(master_th);
1643 
1644  if (ompt_enabled.ompt_callback_parallel_end) {
1645  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1646  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1647  OMPT_INVOKER(call_context), return_address);
1648  }
1649  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1650  }
1651 #endif
1652  return TRUE;
1653  }
1654 
1655  parent_team->t.t_pkfn = microtask;
1656  parent_team->t.t_invoke = invoker;
1657  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1658  parent_team->t.t_active_level++;
1659  parent_team->t.t_level++;
1660 #if OMP_50_ENABLED
1661  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1662 #endif
1663 
1664  /* Change number of threads in the team if requested */
1665  if (master_set_numthreads) { // The parallel has num_threads clause
1666  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1667  // AC: can only reduce the number of threads dynamically; cannot increase it
1668  kmp_info_t **other_threads = parent_team->t.t_threads;
1669  parent_team->t.t_nproc = master_set_numthreads;
1670  for (i = 0; i < master_set_numthreads; ++i) {
1671  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1672  }
1673  // Keep extra threads hot in the team for possible next parallels
1674  }
1675  master_th->th.th_set_nproc = 0;
1676  }
1677 
1678 #if USE_DEBUGGER
1679  if (__kmp_debugging) { // Let debugger override number of threads.
1680  int nth = __kmp_omp_num_threads(loc);
1681  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1682  master_set_numthreads = nth;
1683  }
1684  }
1685 #endif
1686 
1687  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1688  "master_th=%p, gtid=%d\n",
1689  root, parent_team, master_th, gtid));
1690  __kmp_internal_fork(loc, gtid, parent_team);
1691  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1692  "master_th=%p, gtid=%d\n",
1693  root, parent_team, master_th, gtid));
1694 
1695  /* Invoke microtask for MASTER thread */
1696  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1697  parent_team->t.t_id, parent_team->t.t_pkfn));
1698 
1699  if (!parent_team->t.t_invoke(gtid)) {
1700  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1701  }
1702  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1703  parent_team->t.t_id, parent_team->t.t_pkfn));
1704  KMP_MB(); /* Flush all pending memory write invalidates. */
1705 
1706  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1707 
1708  return TRUE;
1709  } // Parallel closely nested in teams construct
1710 #endif /* OMP_40_ENABLED */
1711 
1712 #if KMP_DEBUG
1713  if (__kmp_tasking_mode != tskm_immediate_exec) {
1714  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1715  parent_team->t.t_task_team[master_th->th.th_task_state]);
1716  }
1717 #endif
1718 
1719  if (parent_team->t.t_active_level >=
1720  master_th->th.th_current_task->td_icvs.max_active_levels) {
1721  nthreads = 1;
1722  } else {
1723 #if OMP_40_ENABLED
1724  int enter_teams = ((ap == NULL && active_level == 0) ||
1725  (ap && teams_level > 0 && teams_level == level));
1726 #endif
1727  nthreads =
1728  master_set_numthreads
1729  ? master_set_numthreads
1730  : get__nproc_2(
1731  parent_team,
1732  master_tid); // TODO: get nproc directly from current task
1733 
1734  // Check whether we need to take the forkjoin lock (no need for a
1735  // serialized parallel outside of a teams construct). This code was moved
1736  // here from __kmp_reserve_threads() to speed up nested serialized parallels.
1737  if (nthreads > 1) {
1738  if ((!get__nested(master_th) && (root->r.r_in_parallel
1739 #if OMP_40_ENABLED
1740  && !enter_teams
1741 #endif /* OMP_40_ENABLED */
1742  )) ||
1743  (__kmp_library == library_serial)) {
1744  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1745  " threads\n",
1746  gtid, nthreads));
1747  nthreads = 1;
1748  }
1749  }
1750  if (nthreads > 1) {
1751  /* determine how many new threads we can use */
1752  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1753  nthreads = __kmp_reserve_threads(
1754  root, parent_team, master_tid, nthreads
1755 #if OMP_40_ENABLED
1756  /* AC: If we execute teams from a parallel region (on the host), then
1757  the teams should be created, but each can have only 1 thread if
1758  nesting is disabled. If teams is called from a serial region, then
1759  the teams and their threads should be created regardless of the
1760  nesting setting. */
1761  ,
1762  enter_teams
1763 #endif /* OMP_40_ENABLED */
1764  );
1765  if (nthreads == 1) {
1766  // Free the lock for single-thread execution here; for multi-thread
1767  // execution it will be freed later, after the team of threads has been
1768  // created and initialized
1769  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1770  }
1771  }
1772  }
1773  KMP_DEBUG_ASSERT(nthreads > 0);
1774 
1775  // If we temporarily changed the set number of threads then restore it now
1776  master_th->th.th_set_nproc = 0;
1777 
1778  /* create a serialized parallel region? */
1779  if (nthreads == 1) {
1780 /* josh todo: hypothetical question: what do we do for OS X*? */
1781 #if KMP_OS_LINUX && \
1782  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1783  void *args[argc];
1784 #else
1785  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1786 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1787  KMP_ARCH_AARCH64) */
1788 
1789  KA_TRACE(20,
1790  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1791 
1792  __kmpc_serialized_parallel(loc, gtid);
1793 
1794  if (call_context == fork_context_intel) {
1795  /* TODO this sucks, use the compiler itself to pass args! :) */
1796  master_th->th.th_serial_team->t.t_ident = loc;
1797 #if OMP_40_ENABLED
1798  if (!ap) {
1799  // revert change made in __kmpc_serialized_parallel()
1800  master_th->th.th_serial_team->t.t_level--;
1801 // Get args from parent team for teams construct
1802 
1803 #if OMPT_SUPPORT
1804  void *dummy;
1805  void **exit_runtime_p;
1806  ompt_task_info_t *task_info;
1807 
1808  ompt_lw_taskteam_t lw_taskteam;
1809 
1810  if (ompt_enabled.enabled) {
1811  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1812  &ompt_parallel_data, return_address);
1813 
1814  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1815  // don't use lw_taskteam after linking; its content was swapped
1816 
1817  task_info = OMPT_CUR_TASK_INFO(master_th);
1818  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1819  if (ompt_enabled.ompt_callback_implicit_task) {
1820  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1821  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1822  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1823  OMPT_CUR_TASK_INFO(master_th)
1824  ->thread_num = __kmp_tid_from_gtid(gtid);
1825  }
1826 
1827  /* OMPT state */
1828  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1829  } else {
1830  exit_runtime_p = &dummy;
1831  }
1832 #endif
1833 
1834  {
1835  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1836  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1837  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1838  parent_team->t.t_argv
1839 #if OMPT_SUPPORT
1840  ,
1841  exit_runtime_p
1842 #endif
1843  );
1844  }
1845 
1846 #if OMPT_SUPPORT
1847  if (ompt_enabled.enabled) {
1848  exit_runtime_p = NULL;
1849  if (ompt_enabled.ompt_callback_implicit_task) {
1850  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1851  ompt_scope_end, NULL, &(task_info->task_data), 1,
1852  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1853  }
1854 
1855  __ompt_lw_taskteam_unlink(master_th);
1856  if (ompt_enabled.ompt_callback_parallel_end) {
1857  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1858  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1859  OMPT_INVOKER(call_context), return_address);
1860  }
1861  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1862  }
1863 #endif
1864  } else if (microtask == (microtask_t)__kmp_teams_master) {
1865  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1866  master_th->th.th_serial_team);
1867  team = master_th->th.th_team;
1868  // team->t.t_pkfn = microtask;
1869  team->t.t_invoke = invoker;
1870  __kmp_alloc_argv_entries(argc, team, TRUE);
1871  team->t.t_argc = argc;
1872  argv = (void **)team->t.t_argv;
1873  if (ap) {
1874  for (i = argc - 1; i >= 0; --i)
1875 // TODO: revert workaround for Intel(R) 64 tracker #96
1876 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1877  *argv++ = va_arg(*ap, void *);
1878 #else
1879  *argv++ = va_arg(ap, void *);
1880 #endif
1881  } else {
1882  for (i = 0; i < argc; ++i)
1883  // Get args from parent team for teams construct
1884  argv[i] = parent_team->t.t_argv[i];
1885  }
1886  // AC: revert change made in __kmpc_serialized_parallel()
1887  // because initial code in teams should have level=0
1888  team->t.t_level--;
1889  // AC: call special invoker for outer "parallel" of teams construct
1890  invoker(gtid);
1891  } else {
1892 #endif /* OMP_40_ENABLED */
1893  argv = args;
1894  for (i = argc - 1; i >= 0; --i)
1895 // TODO: revert workaround for Intel(R) 64 tracker #96
1896 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1897  *argv++ = va_arg(*ap, void *);
1898 #else
1899  *argv++ = va_arg(ap, void *);
1900 #endif
1901  KMP_MB();
1902 
1903 #if OMPT_SUPPORT
1904  void *dummy;
1905  void **exit_runtime_p;
1906  ompt_task_info_t *task_info;
1907 
1908  ompt_lw_taskteam_t lw_taskteam;
1909 
1910  if (ompt_enabled.enabled) {
1911  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1912  &ompt_parallel_data, return_address);
1913  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1914  // don't use lw_taskteam after linking; its content was swapped
1915  task_info = OMPT_CUR_TASK_INFO(master_th);
1916  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1917 
1918  /* OMPT implicit task begin */
1919  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1920  if (ompt_enabled.ompt_callback_implicit_task) {
1921  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1922  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1923  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1924  OMPT_CUR_TASK_INFO(master_th)
1925  ->thread_num = __kmp_tid_from_gtid(gtid);
1926  }
1927 
1928  /* OMPT state */
1929  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1930  } else {
1931  exit_runtime_p = &dummy;
1932  }
1933 #endif
1934 
1935  {
1936  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1937  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1938  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1939 #if OMPT_SUPPORT
1940  ,
1941  exit_runtime_p
1942 #endif
1943  );
1944  }
1945 
1946 #if OMPT_SUPPORT
1947  if (ompt_enabled.enabled) {
1948  *exit_runtime_p = NULL;
1949  if (ompt_enabled.ompt_callback_implicit_task) {
1950  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1951  ompt_scope_end, NULL, &(task_info->task_data), 1,
1952  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1953  }
1954 
1955  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1956  __ompt_lw_taskteam_unlink(master_th);
1957  if (ompt_enabled.ompt_callback_parallel_end) {
1958  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1959  &ompt_parallel_data, parent_task_data,
1960  OMPT_INVOKER(call_context), return_address);
1961  }
1962  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1963  }
1964 #endif
1965 #if OMP_40_ENABLED
1966  }
1967 #endif /* OMP_40_ENABLED */
1968  } else if (call_context == fork_context_gnu) {
1969 #if OMPT_SUPPORT
1970  ompt_lw_taskteam_t lwt;
1971  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1972  return_address);
1973 
1974  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1975  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1976 // don't use lw_taskteam after linking; its content was swapped
1977 #endif
1978 
1979  // we were called from GNU native code
1980  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1981  return FALSE;
1982  } else {
1983  KMP_ASSERT2(call_context < fork_context_last,
1984  "__kmp_fork_call: unknown fork_context parameter");
1985  }
1986 
1987  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1988  KMP_MB();
1989  return FALSE;
1990  } // if (nthreads == 1)
1991 
1992  // GEH: only modify the executing flag in the non-serialized case;
1993  // the serialized case is handled in __kmpc_serialized_parallel
1994  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1995  "curtask=%p, curtask_max_aclevel=%d\n",
1996  parent_team->t.t_active_level, master_th,
1997  master_th->th.th_current_task,
1998  master_th->th.th_current_task->td_icvs.max_active_levels));
1999  // TODO: GEH - cannot do this assertion because root thread not set up as
2000  // executing
2001  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2002  master_th->th.th_current_task->td_flags.executing = 0;
2003 
2004 #if OMP_40_ENABLED
2005  if (!master_th->th.th_teams_microtask || level > teams_level)
2006 #endif /* OMP_40_ENABLED */
2007  {
2008  /* Increment our nested depth level */
2009  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2010  }
2011 
2012  // See if we need to make a copy of the ICVs.
2013  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2014  if ((level + 1 < __kmp_nested_nth.used) &&
2015  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2016  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2017  } else {
2018  nthreads_icv = 0; // don't update
2019  }
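  // A value of 0 in nthreads_icv means the inherited nproc ICV stays in
  // effect; a positive value (taken from the nested nthreads list above)
  // forces a private ICV copy for the new team further below.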
2020 
2021 #if OMP_40_ENABLED
2022  // Figure out the proc_bind_policy for the new team.
2023  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2024  kmp_proc_bind_t proc_bind_icv =
2025  proc_bind_default; // proc_bind_default means don't update
2026  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2027  proc_bind = proc_bind_false;
2028  } else {
2029  if (proc_bind == proc_bind_default) {
2030  // No proc_bind clause specified; use current proc-bind-var for this
2031  // parallel region
2032  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2033  }
2034  /* else: The proc_bind policy was specified explicitly on parallel clause.
2035  This overrides proc-bind-var for this parallel region, but does not
2036  change proc-bind-var. */
2037  // Figure the value of proc-bind-var for the child threads.
2038  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2039  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2040  master_th->th.th_current_task->td_icvs.proc_bind)) {
2041  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2042  }
2043  }
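  // At this point proc_bind is the binding policy for this parallel region,
  // and proc_bind_icv (when not proc_bind_default) is the proc-bind-var value
  // that the child threads should inherit.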
2044 
2045  // Reset for next parallel region
2046  master_th->th.th_set_proc_bind = proc_bind_default;
2047 #endif /* OMP_40_ENABLED */
2048 
2049  if ((nthreads_icv > 0)
2050 #if OMP_40_ENABLED
2051  || (proc_bind_icv != proc_bind_default)
2052 #endif /* OMP_40_ENABLED */
2053  ) {
2054  kmp_internal_control_t new_icvs;
2055  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2056  new_icvs.next = NULL;
2057  if (nthreads_icv > 0) {
2058  new_icvs.nproc = nthreads_icv;
2059  }
2060 
2061 #if OMP_40_ENABLED
2062  if (proc_bind_icv != proc_bind_default) {
2063  new_icvs.proc_bind = proc_bind_icv;
2064  }
2065 #endif /* OMP_40_ENABLED */
2066 
2067  /* allocate a new parallel team */
2068  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2069  team = __kmp_allocate_team(root, nthreads, nthreads,
2070 #if OMPT_SUPPORT
2071  ompt_parallel_data,
2072 #endif
2073 #if OMP_40_ENABLED
2074  proc_bind,
2075 #endif
2076  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2077  } else {
2078  /* allocate a new parallel team */
2079  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2080  team = __kmp_allocate_team(root, nthreads, nthreads,
2081 #if OMPT_SUPPORT
2082  ompt_parallel_data,
2083 #endif
2084 #if OMP_40_ENABLED
2085  proc_bind,
2086 #endif
2087  &master_th->th.th_current_task->td_icvs,
2088  argc USE_NESTED_HOT_ARG(master_th));
2089  }
2090  KF_TRACE(
2091  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
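  // Note that __kmp_allocate_team may hand back a re-initialized (nested) hot
  // team rather than a freshly allocated one, so the KMP_CHECK_UPDATE calls
  // below only write fields whose values actually changed.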
2092 
2093  /* setup the new team */
2094  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2095  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2096  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2097  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2098  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2099 #if OMPT_SUPPORT
2100  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2101  return_address);
2102 #endif
2103  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2104 // TODO: parent_team->t.t_level == INT_MAX ???
2105 #if OMP_40_ENABLED
2106  if (!master_th->th.th_teams_microtask || level > teams_level) {
2107 #endif /* OMP_40_ENABLED */
2108  int new_level = parent_team->t.t_level + 1;
2109  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2110  new_level = parent_team->t.t_active_level + 1;
2111  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2112 #if OMP_40_ENABLED
2113  } else {
2114  // AC: Do not increase parallel level at start of the teams construct
2115  int new_level = parent_team->t.t_level;
2116  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2117  new_level = parent_team->t.t_active_level;
2118  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2119  }
2120 #endif /* OMP_40_ENABLED */
2121  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2122  // set master's schedule as new run-time schedule
2123  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2124 
2125 #if OMP_40_ENABLED
2126  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2127 #endif
2128 #if OMP_50_ENABLED
2129  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2130 #endif
2131 
2132  // Update the floating point rounding in the team if required.
2133  propagateFPControl(team);
2134 
2135  if (__kmp_tasking_mode != tskm_immediate_exec) {
2136  // Set master's task team to the team's task team. Unless this is a hot
2137  // team, it should be NULL.
2138  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2139  parent_team->t.t_task_team[master_th->th.th_task_state]);
2140  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2141  "%p, new task_team %p / team %p\n",
2142  __kmp_gtid_from_thread(master_th),
2143  master_th->th.th_task_team, parent_team,
2144  team->t.t_task_team[master_th->th.th_task_state], team));
2145 
2146  if (active_level || master_th->th.th_task_team) {
2147  // Take a memo of master's task_state
2148  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
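      // The memo stack saves the master's th_task_state for each nested active
      // level so it can be restored at the matching join; grow it when full.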
2149  if (master_th->th.th_task_state_top >=
2150  master_th->th.th_task_state_stack_sz) { // increase size
2151  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2152  kmp_uint8 *old_stack, *new_stack;
2153  kmp_uint32 i;
2154  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2155  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2156  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2157  }
2158  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2159  ++i) { // zero-init rest of stack
2160  new_stack[i] = 0;
2161  }
2162  old_stack = master_th->th.th_task_state_memo_stack;
2163  master_th->th.th_task_state_memo_stack = new_stack;
2164  master_th->th.th_task_state_stack_sz = new_size;
2165  __kmp_free(old_stack);
2166  }
2167  // Store master's task_state on stack
2168  master_th->th
2169  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2170  master_th->th.th_task_state;
2171  master_th->th.th_task_state_top++;
2172 #if KMP_NESTED_HOT_TEAMS
2173  if (master_th->th.th_hot_teams &&
2174  active_level < __kmp_hot_teams_max_level &&
2175  team == master_th->th.th_hot_teams[active_level].hot_team) {
2176  // Restore master's nested state if nested hot team
2177  master_th->th.th_task_state =
2178  master_th->th
2179  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2180  } else {
2181 #endif
2182  master_th->th.th_task_state = 0;
2183 #if KMP_NESTED_HOT_TEAMS
2184  }
2185 #endif
2186  }
2187 #if !KMP_NESTED_HOT_TEAMS
2188  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2189  (team == root->r.r_hot_team));
2190 #endif
2191  }
2192 
2193  KA_TRACE(
2194  20,
2195  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2196  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2197  team->t.t_nproc));
2198  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2199  (team->t.t_master_tid == 0 &&
2200  (team->t.t_parent == root->r.r_root_team ||
2201  team->t.t_parent->t.t_serialized)));
2202  KMP_MB();
2203 
2204  /* now, setup the arguments */
2205  argv = (void **)team->t.t_argv;
2206 #if OMP_40_ENABLED
2207  if (ap) {
2208 #endif /* OMP_40_ENABLED */
2209  for (i = argc - 1; i >= 0; --i) {
2210 // TODO: revert workaround for Intel(R) 64 tracker #96
2211 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2212  void *new_argv = va_arg(*ap, void *);
2213 #else
2214  void *new_argv = va_arg(ap, void *);
2215 #endif
2216  KMP_CHECK_UPDATE(*argv, new_argv);
2217  argv++;
2218  }
2219 #if OMP_40_ENABLED
2220  } else {
2221  for (i = 0; i < argc; ++i) {
2222  // Get args from parent team for teams construct
2223  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2224  }
2225  }
2226 #endif /* OMP_40_ENABLED */
2227 
2228  /* now actually fork the threads */
2229  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2230  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2231  root->r.r_active = TRUE;
2232 
2233  __kmp_fork_team_threads(root, team, master_th, gtid);
2234  __kmp_setup_icv_copy(team, nthreads,
2235  &master_th->th.th_current_task->td_icvs, loc);
2236 
2237 #if OMPT_SUPPORT
2238  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2239 #endif
2240 
2241  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2242 
2243 #if USE_ITT_BUILD
2244  if (team->t.t_active_level == 1 // only report frames at level 1
2245 #if OMP_40_ENABLED
2246  && !master_th->th.th_teams_microtask // not in teams construct
2247 #endif /* OMP_40_ENABLED */
2248  ) {
2249 #if USE_ITT_NOTIFY
2250  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2251  (__kmp_forkjoin_frames_mode == 3 ||
2252  __kmp_forkjoin_frames_mode == 1)) {
2253  kmp_uint64 tmp_time = 0;
2254  if (__itt_get_timestamp_ptr)
2255  tmp_time = __itt_get_timestamp();
2256  // Internal fork - report frame begin
2257  master_th->th.th_frame_time = tmp_time;
2258  if (__kmp_forkjoin_frames_mode == 3)
2259  team->t.t_region_time = tmp_time;
2260  } else
2261 // only one notification scheme (either "submit" or "forking/joined", not both)
2262 #endif /* USE_ITT_NOTIFY */
2263  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2264  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2265  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2266  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2267  }
2268  }
2269 #endif /* USE_ITT_BUILD */
2270 
2271  /* now go on and do the work */
2272  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2273  KMP_MB();
2274  KF_TRACE(10,
2275  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2276  root, team, master_th, gtid));
2277 
2278 #if USE_ITT_BUILD
2279  if (__itt_stack_caller_create_ptr) {
2280  team->t.t_stack_id =
2281  __kmp_itt_stack_caller_create(); // create new stack stitching id
2282  // before entering fork barrier
2283  }
2284 #endif /* USE_ITT_BUILD */
2285 
2286 #if OMP_40_ENABLED
2287  // AC: skip __kmp_internal_fork at teams construct, let only master
2288  // threads execute
2289  if (ap)
2290 #endif /* OMP_40_ENABLED */
2291  {
2292  __kmp_internal_fork(loc, gtid, team);
2293  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2294  "master_th=%p, gtid=%d\n",
2295  root, team, master_th, gtid));
2296  }
2297 
2298  if (call_context == fork_context_gnu) {
2299  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300  return TRUE;
2301  }
2302 
2303  /* Invoke microtask for MASTER thread */
2304  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2305  team->t.t_id, team->t.t_pkfn));
2306  } // END of timer KMP_fork_call block
2307 
2308  if (!team->t.t_invoke(gtid)) {
2309  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2310  }
2311  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2312  team->t.t_id, team->t.t_pkfn));
2313  KMP_MB(); /* Flush all pending memory write invalidates. */
2314 
2315  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2316 
2317 #if OMPT_SUPPORT
2318  if (ompt_enabled.enabled) {
2319  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2320  }
2321 #endif
2322 
2323  return TRUE;
2324 }
2325 
2326 #if OMPT_SUPPORT
2327 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2328  kmp_team_t *team) {
2329  // restore state outside the region
2330  thread->th.ompt_thread_info.state =
2331  ((team->t.t_serialized) ? ompt_state_work_serial
2332  : ompt_state_work_parallel);
2333 }
2334 
2335 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2336  kmp_team_t *team, ompt_data_t *parallel_data,
2337  fork_context_e fork_context, void *codeptr) {
2338  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2339  if (ompt_enabled.ompt_callback_parallel_end) {
2340  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2341  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2342  codeptr);
2343  }
2344 
2345  task_info->frame.enter_frame = ompt_data_none;
2346  __kmp_join_restore_state(thread, team);
2347 }
2348 #endif
2349 
2350 void __kmp_join_call(ident_t *loc, int gtid
2351 #if OMPT_SUPPORT
2352  ,
2353  enum fork_context_e fork_context
2354 #endif
2355 #if OMP_40_ENABLED
2356  ,
2357  int exit_teams
2358 #endif /* OMP_40_ENABLED */
2359  ) {
2360  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2361  kmp_team_t *team;
2362  kmp_team_t *parent_team;
2363  kmp_info_t *master_th;
2364  kmp_root_t *root;
2365  int master_active;
2366  int i;
2367 
2368  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2369 
2370  /* setup current data */
2371  master_th = __kmp_threads[gtid];
2372  root = master_th->th.th_root;
2373  team = master_th->th.th_team;
2374  parent_team = team->t.t_parent;
2375 
2376  master_th->th.th_ident = loc;
2377 
2378 #if OMPT_SUPPORT
2379  if (ompt_enabled.enabled) {
2380  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2381  }
2382 #endif
2383 
2384 #if KMP_DEBUG
2385  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2386  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2387  "th_task_team = %p\n",
2388  __kmp_gtid_from_thread(master_th), team,
2389  team->t.t_task_team[master_th->th.th_task_state],
2390  master_th->th.th_task_team));
2391  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2392  team->t.t_task_team[master_th->th.th_task_state]);
2393  }
2394 #endif
2395 
2396  if (team->t.t_serialized) {
2397 #if OMP_40_ENABLED
2398  if (master_th->th.th_teams_microtask) {
2399  // We are in teams construct
2400  int level = team->t.t_level;
2401  int tlevel = master_th->th.th_teams_level;
2402  if (level == tlevel) {
2403  // AC: we haven't incremented it earlier at start of teams construct,
2404  // so do it here - at the end of teams construct
2405  team->t.t_level++;
2406  } else if (level == tlevel + 1) {
2407  // AC: we are exiting parallel inside teams, need to increment
2408  // serialization in order to restore it in the next call to
2409  // __kmpc_end_serialized_parallel
2410  team->t.t_serialized++;
2411  }
2412  }
2413 #endif /* OMP_40_ENABLED */
2414  __kmpc_end_serialized_parallel(loc, gtid);
2415 
2416 #if OMPT_SUPPORT
2417  if (ompt_enabled.enabled) {
2418  __kmp_join_restore_state(master_th, parent_team);
2419  }
2420 #endif
2421 
2422  return;
2423  }
2424 
2425  master_active = team->t.t_master_active;
2426 
2427 #if OMP_40_ENABLED
2428  if (!exit_teams)
2429 #endif /* OMP_40_ENABLED */
2430  {
2431  // AC: No barrier for internal teams at exit from teams construct.
2432  // But there is barrier for external team (league).
2433  __kmp_internal_join(loc, gtid, team);
2434  }
2435 #if OMP_40_ENABLED
2436  else {
2437  master_th->th.th_task_state =
2438  0; // AC: no tasking in teams (out of any parallel)
2439  }
2440 #endif /* OMP_40_ENABLED */
2441 
2442  KMP_MB();
2443 
2444 #if OMPT_SUPPORT
2445  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2446  void *codeptr = team->t.ompt_team_info.master_return_address;
2447 #endif
2448 
2449 #if USE_ITT_BUILD
2450  if (__itt_stack_caller_create_ptr) {
2451  __kmp_itt_stack_caller_destroy(
2452  (__itt_caller)team->t
2453  .t_stack_id); // destroy the stack stitching id after join barrier
2454  }
2455 
2456  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2457  if (team->t.t_active_level == 1
2458 #if OMP_40_ENABLED
2459  && !master_th->th.th_teams_microtask /* not in teams construct */
2460 #endif /* OMP_40_ENABLED */
2461  ) {
2462  master_th->th.th_ident = loc;
2463  // only one notification scheme (either "submit" or "forking/joined", not
2464  // both)
2465  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2466  __kmp_forkjoin_frames_mode == 3)
2467  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2468  master_th->th.th_frame_time, 0, loc,
2469  master_th->th.th_team_nproc, 1);
2470  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2471  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2472  __kmp_itt_region_joined(gtid);
2473  } // active_level == 1
2474 #endif /* USE_ITT_BUILD */
2475 
2476 #if OMP_40_ENABLED
2477  if (master_th->th.th_teams_microtask && !exit_teams &&
2478  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2479  team->t.t_level == master_th->th.th_teams_level + 1) {
2480  // AC: We need to leave the team structure intact at the end of a parallel
2481  // inside the teams construct, so that the same (hot) team works at the next
2482  // parallel; only adjust the nesting levels here.
2483 
2484  /* Decrement our nested depth level */
2485  team->t.t_level--;
2486  team->t.t_active_level--;
2487  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2488 
2489  /* Restore number of threads in the team if needed */
2490  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2491  int old_num = master_th->th.th_team_nproc;
2492  int new_num = master_th->th.th_teams_size.nth;
2493  kmp_info_t **other_threads = team->t.t_threads;
2494  team->t.t_nproc = new_num;
2495  for (i = 0; i < old_num; ++i) {
2496  other_threads[i]->th.th_team_nproc = new_num;
2497  }
2498  // Adjust states of non-used threads of the team
2499  for (i = old_num; i < new_num; ++i) {
2500  // Re-initialize thread's barrier data.
2501  int b;
2502  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2503  for (b = 0; b < bs_last_barrier; ++b) {
2504  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2505  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2506 #if USE_DEBUGGER
2507  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2508 #endif
2509  }
2510  if (__kmp_tasking_mode != tskm_immediate_exec) {
2511  // Synchronize thread's task state
2512  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2513  }
2514  }
2515  }
2516 
2517 #if OMPT_SUPPORT
2518  if (ompt_enabled.enabled) {
2519  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2520  codeptr);
2521  }
2522 #endif
2523 
2524  return;
2525  }
2526 #endif /* OMP_40_ENABLED */
2527 
2528  /* do cleanup and restore the parent team */
2529  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2530  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2531 
2532  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2533 
2534  /* jc: The following lock has instructions with REL and ACQ semantics,
2535  separating the parallel user code called in this parallel region
2536  from the serial user code called after this function returns. */
2537  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2538 
2539 #if OMP_40_ENABLED
2540  if (!master_th->th.th_teams_microtask ||
2541  team->t.t_level > master_th->th.th_teams_level)
2542 #endif /* OMP_40_ENABLED */
2543  {
2544  /* Decrement our nested depth level */
2545  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2546  }
2547  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2548 
2549 #if OMPT_SUPPORT
2550  if (ompt_enabled.enabled) {
2551  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2552  if (ompt_enabled.ompt_callback_implicit_task) {
2553  int ompt_team_size = team->t.t_nproc;
2554  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2555  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2556  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2557  }
2558 
2559  task_info->frame.exit_frame = ompt_data_none;
2560  task_info->task_data = ompt_data_none;
2561  }
2562 #endif
2563 
2564  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2565  master_th, team));
2566  __kmp_pop_current_task_from_thread(master_th);
2567 
2568 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2569  // Restore master thread's partition.
2570  master_th->th.th_first_place = team->t.t_first_place;
2571  master_th->th.th_last_place = team->t.t_last_place;
2572 #endif /* OMP_40_ENABLED */
2573 #if OMP_50_ENABLED
2574  master_th->th.th_def_allocator = team->t.t_def_allocator;
2575 #endif
2576 
2577  updateHWFPControl(team);
2578 
2579  if (root->r.r_active != master_active)
2580  root->r.r_active = master_active;
2581 
2582  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2583  master_th)); // this will free worker threads
2584 
2585  /* This race was fun to find. Make sure the following stays inside the
2586  critical region; otherwise assertions may fail occasionally, since the old
2587  team may be reallocated and the hierarchy then appears inconsistent. It is
2588  actually safe to run and won't cause any bugs, but it will cause those
2589  assertion failures. It's only one deref & assign, so it might as well be here. */
2590  master_th->th.th_team = parent_team;
2591  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2592  master_th->th.th_team_master = parent_team->t.t_threads[0];
2593  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2594 
2595  /* restore serialized team, if need be */
2596  if (parent_team->t.t_serialized &&
2597  parent_team != master_th->th.th_serial_team &&
2598  parent_team != root->r.r_root_team) {
2599  __kmp_free_team(root,
2600  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2601  master_th->th.th_serial_team = parent_team;
2602  }
2603 
2604  if (__kmp_tasking_mode != tskm_immediate_exec) {
2605  if (master_th->th.th_task_state_top >
2606  0) { // Restore task state from memo stack
2607  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2608  // Remember master's state if we re-use this nested hot team
2609  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2610  master_th->th.th_task_state;
2611  --master_th->th.th_task_state_top; // pop
2612  // Now restore state at this level
2613  master_th->th.th_task_state =
2614  master_th->th
2615  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2616  }
2617  // Copy the task team from the parent team to the master thread
2618  master_th->th.th_task_team =
2619  parent_team->t.t_task_team[master_th->th.th_task_state];
2620  KA_TRACE(20,
2621  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2622  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2623  parent_team));
2624  }
2625 
2626  // TODO: GEH - cannot do this assertion because root thread not set up as
2627  // executing
2628  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2629  master_th->th.th_current_task->td_flags.executing = 1;
2630 
2631  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2632 
2633 #if OMPT_SUPPORT
2634  if (ompt_enabled.enabled) {
2635  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2636  codeptr);
2637  }
2638 #endif
2639 
2640  KMP_MB();
2641  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2642 }
2643 
2644 /* Check whether we should push an internal control record onto the
2645  serial team stack. If so, do it. */
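/* This is called before a user-visible ICV is modified (e.g. from
   __kmp_set_num_threads or __kmp_set_schedule below) so that the value in
   effect at the current serialized nesting level can be restored when that
   level unwinds. */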
2646 void __kmp_save_internal_controls(kmp_info_t *thread) {
2647 
2648  if (thread->th.th_team != thread->th.th_serial_team) {
2649  return;
2650  }
2651  if (thread->th.th_team->t.t_serialized > 1) {
2652  int push = 0;
2653 
2654  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2655  push = 1;
2656  } else {
2657  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2658  thread->th.th_team->t.t_serialized) {
2659  push = 1;
2660  }
2661  }
2662  if (push) { /* push a record on the serial team's stack */
2663  kmp_internal_control_t *control =
2664  (kmp_internal_control_t *)__kmp_allocate(
2665  sizeof(kmp_internal_control_t));
2666 
2667  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2668 
2669  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2670 
2671  control->next = thread->th.th_team->t.t_control_stack_top;
2672  thread->th.th_team->t.t_control_stack_top = control;
2673  }
2674  }
2675 }
2676 
2677 /* Changes set_nproc */
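/* Backing routine for omp_set_num_threads(): clamps the request to
   [1, __kmp_max_nth] and, when the (idle) hot team is now oversized, shrinks
   it immediately rather than waiting for the next parallel region. */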
2678 void __kmp_set_num_threads(int new_nth, int gtid) {
2679  kmp_info_t *thread;
2680  kmp_root_t *root;
2681 
2682  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2683  KMP_DEBUG_ASSERT(__kmp_init_serial);
2684 
2685  if (new_nth < 1)
2686  new_nth = 1;
2687  else if (new_nth > __kmp_max_nth)
2688  new_nth = __kmp_max_nth;
2689 
2690  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2691  thread = __kmp_threads[gtid];
2692  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2693  return; // nothing to do
2694 
2695  __kmp_save_internal_controls(thread);
2696 
2697  set__nproc(thread, new_nth);
2698 
2699  // If this omp_set_num_threads() call will cause the hot team size to be
2700  // reduced (in the absence of a num_threads clause), then reduce it now,
2701  // rather than waiting for the next parallel region.
2702  root = thread->th.th_root;
2703  if (__kmp_init_parallel && (!root->r.r_active) &&
2704  (root->r.r_hot_team->t.t_nproc > new_nth)
2705 #if KMP_NESTED_HOT_TEAMS
2706  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2707 #endif
2708  ) {
2709  kmp_team_t *hot_team = root->r.r_hot_team;
2710  int f;
2711 
2712  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2713 
2714  // Release the extra threads we don't need any more.
2715  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2716  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2717  if (__kmp_tasking_mode != tskm_immediate_exec) {
2718  // When decreasing the team size, threads no longer in the team should
2719  // unreference the task team.
2720  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2721  }
2722  __kmp_free_thread(hot_team->t.t_threads[f]);
2723  hot_team->t.t_threads[f] = NULL;
2724  }
2725  hot_team->t.t_nproc = new_nth;
2726 #if KMP_NESTED_HOT_TEAMS
2727  if (thread->th.th_hot_teams) {
2728  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2729  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2730  }
2731 #endif
2732 
2733  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2734 
2735  // Update the t_nproc field in the threads that are still active.
2736  for (f = 0; f < new_nth; f++) {
2737  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2738  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2739  }
2740  // Special flag marking that the team size was changed by omp_set_num_threads()
2741  hot_team->t.t_size_changed = -1;
2742  }
2743 }
2744 
2745 /* Changes max_active_levels */
2746 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2747  kmp_info_t *thread;
2748 
2749  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2750  "%d = (%d)\n",
2751  gtid, max_active_levels));
2752  KMP_DEBUG_ASSERT(__kmp_init_serial);
2753 
2754  // validate max_active_levels
2755  if (max_active_levels < 0) {
2756  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2757  // We ignore this call if the user has specified a negative value.
2758  // The current setting won't be changed. The last valid setting will be
2759  // used. A warning will be issued (if warnings are allowed as controlled by
2760  // the KMP_WARNINGS env var).
2761  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2762  "max_active_levels for thread %d = (%d)\n",
2763  gtid, max_active_levels));
2764  return;
2765  }
2766  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2767  // it's OK, the max_active_levels is within the valid range: [ 0;
2768  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2769  // We allow a zero value. (implementation defined behavior)
2770  } else {
2771  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2772  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2773  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2774  // Current upper limit is MAX_INT. (implementation defined behavior)
2775  // If the input exceeds the upper limit, we correct the input to be the
2776  // upper limit. (implementation defined behavior)
2777  // Actually, the flow should never get here while the upper limit is MAX_INT.
2778  }
2779  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2780  "max_active_levels for thread %d = (%d)\n",
2781  gtid, max_active_levels));
2782 
2783  thread = __kmp_threads[gtid];
2784 
2785  __kmp_save_internal_controls(thread);
2786 
2787  set__max_active_levels(thread, max_active_levels);
2788 }
2789 
2790 /* Gets max_active_levels */
2791 int __kmp_get_max_active_levels(int gtid) {
2792  kmp_info_t *thread;
2793 
2794  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2795  KMP_DEBUG_ASSERT(__kmp_init_serial);
2796 
2797  thread = __kmp_threads[gtid];
2798  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2799  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2800  "curtask_maxaclevel=%d\n",
2801  gtid, thread->th.th_current_task,
2802  thread->th.th_current_task->td_icvs.max_active_levels));
2803  return thread->th.th_current_task->td_icvs.max_active_levels;
2804 }
2805 
2806 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2807 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2808  kmp_info_t *thread;
2809  // kmp_team_t *team;
2810 
2811  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2812  gtid, (int)kind, chunk));
2813  KMP_DEBUG_ASSERT(__kmp_init_serial);
2814 
2815  // Check if the kind parameter is valid, correct if needed.
2816  // Valid parameters should fit in one of two intervals - standard or extended:
2817  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2818  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2819  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2820  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2821  // TODO: Hint needs attention in case we change the default schedule.
2822  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2823  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2824  __kmp_msg_null);
2825  kind = kmp_sched_default;
2826  chunk = 0; // ignore chunk value in case of bad kind
2827  }
2828 
2829  thread = __kmp_threads[gtid];
2830 
2831  __kmp_save_internal_controls(thread);
2832 
2833  if (kind < kmp_sched_upper_std) {
2834  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2835  // differentiate static chunked vs. unchunked: the chunk should be invalid
2836  // to indicate an unchunked schedule (which is the default)
2837  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2838  } else {
2839  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2840  __kmp_sch_map[kind - kmp_sched_lower - 1];
2841  }
2842  } else {
2843  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2844  // kmp_sched_lower - 2 ];
2845  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2846  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2847  kmp_sched_lower - 2];
2848  }
2849  if (kind == kmp_sched_auto || chunk < 1) {
2850  // ignore parameter chunk for schedule auto
2851  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2852  } else {
2853  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2854  }
2855 }
2856 
2857 /* Gets def_sched_var ICV values */
2858 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2859  kmp_info_t *thread;
2860  enum sched_type th_type;
2861 
2862  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2863  KMP_DEBUG_ASSERT(__kmp_init_serial);
2864 
2865  thread = __kmp_threads[gtid];
2866 
2867  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2868 
2869  switch (th_type) {
2870  case kmp_sch_static:
2871  case kmp_sch_static_greedy:
2872  case kmp_sch_static_balanced:
2873  *kind = kmp_sched_static;
2874  *chunk = 0; // chunk was not set; signal this fact with a zero value
2875  return;
2876  case kmp_sch_static_chunked:
2877  *kind = kmp_sched_static;
2878  break;
2879  case kmp_sch_dynamic_chunked:
2880  *kind = kmp_sched_dynamic;
2881  break;
2882  case kmp_sch_guided_chunked:
2883  case kmp_sch_guided_iterative_chunked:
2884  case kmp_sch_guided_analytical_chunked:
2885  *kind = kmp_sched_guided;
2886  break;
2887  case kmp_sch_auto:
2888  *kind = kmp_sched_auto;
2889  break;
2890  case kmp_sch_trapezoidal:
2891  *kind = kmp_sched_trapezoidal;
2892  break;
2893 #if KMP_STATIC_STEAL_ENABLED
2894  case kmp_sch_static_steal:
2895  *kind = kmp_sched_static_steal;
2896  break;
2897 #endif
2898  default:
2899  KMP_FATAL(UnknownSchedulingType, th_type);
2900  }
2901 
2902  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2903 }
2904 
2905 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2906 
2907  int ii, dd;
2908  kmp_team_t *team;
2909  kmp_info_t *thr;
2910 
2911  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2912  KMP_DEBUG_ASSERT(__kmp_init_serial);
2913 
2914  // validate level
2915  if (level == 0)
2916  return 0;
2917  if (level < 0)
2918  return -1;
2919  thr = __kmp_threads[gtid];
2920  team = thr->th.th_team;
2921  ii = team->t.t_level;
2922  if (level > ii)
2923  return -1;
2924 
2925 #if OMP_40_ENABLED
2926  if (thr->th.th_teams_microtask) {
2927  // AC: we are in teams region where multiple nested teams have same level
2928  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2929  if (level <=
2930  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2931  KMP_DEBUG_ASSERT(ii >= tlevel);
2932  // AC: As we need to pass through the teams league, we artificially
2933  // increase ii
2934  if (ii == tlevel) {
2935  ii += 2; // three teams have same level
2936  } else {
2937  ii++; // two teams have same level
2938  }
2939  }
2940  }
2941 #endif
2942 
2943  if (ii == level)
2944  return __kmp_tid_from_gtid(gtid);
2945 
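  // Walk up the team hierarchy: each serialized nesting (dd) and each hop to a
  // parent team consumes one level, until the requested ancestor level is
  // reached.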
2946  dd = team->t.t_serialized;
2947  level++;
2948  while (ii > level) {
2949  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2950  }
2951  if ((team->t.t_serialized) && (!dd)) {
2952  team = team->t.t_parent;
2953  continue;
2954  }
2955  if (ii > level) {
2956  team = team->t.t_parent;
2957  dd = team->t.t_serialized;
2958  ii--;
2959  }
2960  }
2961 
2962  return (dd > 1) ? (0) : (team->t.t_master_tid);
2963 }
2964 
2965 int __kmp_get_team_size(int gtid, int level) {
2966 
2967  int ii, dd;
2968  kmp_team_t *team;
2969  kmp_info_t *thr;
2970 
2971  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2972  KMP_DEBUG_ASSERT(__kmp_init_serial);
2973 
2974  // validate level
2975  if (level == 0)
2976  return 1;
2977  if (level < 0)
2978  return -1;
2979  thr = __kmp_threads[gtid];
2980  team = thr->th.th_team;
2981  ii = team->t.t_level;
2982  if (level > ii)
2983  return -1;
2984 
2985 #if OMP_40_ENABLED
2986  if (thr->th.th_teams_microtask) {
2987  // AC: we are in teams region where multiple nested teams have same level
2988  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2989  if (level <=
2990  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2991  KMP_DEBUG_ASSERT(ii >= tlevel);
2992  // AC: As we need to pass through the teams league, we artificially
2993  // increase ii
2994  if (ii == tlevel) {
2995  ii += 2; // three teams have same level
2996  } else {
2997  ii++; // two teams have same level
2998  }
2999  }
3000  }
3001 #endif
3002 
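  // Same walk as in __kmp_get_ancestor_thread_num: consume serialized levels
  // and parent teams until the requested level is reached, then report that
  // team's size.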
3003  while (ii > level) {
3004  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3005  }
3006  if (team->t.t_serialized && (!dd)) {
3007  team = team->t.t_parent;
3008  continue;
3009  }
3010  if (ii > level) {
3011  team = team->t.t_parent;
3012  ii--;
3013  }
3014  }
3015 
3016  return team->t.t_nproc;
3017 }
3018 
3019 kmp_r_sched_t __kmp_get_schedule_global() {
3020  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3021  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3022  // independently, so the updated schedule can be obtained here.
3023 
3024  kmp_r_sched_t r_sched;
3025 
3026  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3027  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3028  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3029  // different roots (even in OMP 2.5)
3030  if (__kmp_sched == kmp_sch_static) {
3031  // replace STATIC with more detailed schedule (balanced or greedy)
3032  r_sched.r_sched_type = __kmp_static;
3033  } else if (__kmp_sched == kmp_sch_guided_chunked) {
3034  // replace GUIDED with more detailed schedule (iterative or analytical)
3035  r_sched.r_sched_type = __kmp_guided;
3036  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3037  r_sched.r_sched_type = __kmp_sched;
3038  }
3039 
3040  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3041  // __kmp_chunk may be wrong here (if it was not ever set)
3042  r_sched.chunk = KMP_DEFAULT_CHUNK;
3043  } else {
3044  r_sched.chunk = __kmp_chunk;
3045  }
3046 
3047  return r_sched;
3048 }
3049 
3050 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3051  at least argc *t_argv entries for the requested team. */
3052 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3053 
3054  KMP_DEBUG_ASSERT(team);
3055  if (!realloc || argc > team->t.t_max_argc) {
3056 
3057  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3058  "current entries=%d\n",
3059  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3060  /* if previously allocated heap space for args, free them */
3061  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3062  __kmp_free((void *)team->t.t_argv);
3063 
3064  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3065  /* use unused space in the cache line for arguments */
3066  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3067  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3068  "argv entries\n",
3069  team->t.t_id, team->t.t_max_argc));
3070  team->t.t_argv = &team->t.t_inline_argv[0];
3071  if (__kmp_storage_map) {
3072  __kmp_print_storage_map_gtid(
3073  -1, &team->t.t_inline_argv[0],
3074  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3075  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3076  team->t.t_id);
3077  }
3078  } else {
3079  /* allocate space for arguments in the heap */
3080  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3081  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3082  : 2 * argc;
3083  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3084  "argv entries\n",
3085  team->t.t_id, team->t.t_max_argc));
3086  team->t.t_argv =
3087  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3088  if (__kmp_storage_map) {
3089  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3090  &team->t.t_argv[team->t.t_max_argc],
3091  sizeof(void *) * team->t.t_max_argc,
3092  "team_%d.t_argv", team->t.t_id);
3093  }
3094  }
3095  }
3096 }
3097 
3098 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3099  int i;
3100  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3101  team->t.t_threads =
3102  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3103  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3104  sizeof(dispatch_shared_info_t) * num_disp_buff);
3105  team->t.t_dispatch =
3106  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3107  team->t.t_implicit_task_taskdata =
3108  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3109  team->t.t_max_nproc = max_nth;
3110 
3111  /* setup dispatch buffers */
3112  for (i = 0; i < num_disp_buff; ++i) {
3113  team->t.t_disp_buffer[i].buffer_index = i;
3114 #if OMP_45_ENABLED
3115  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3116 #endif
3117  }
3118 }
3119 
3120 static void __kmp_free_team_arrays(kmp_team_t *team) {
3121  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3122  int i;
3123  for (i = 0; i < team->t.t_max_nproc; ++i) {
3124  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3125  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3126  team->t.t_dispatch[i].th_disp_buffer = NULL;
3127  }
3128  }
3129 #if KMP_USE_HIER_SCHED
3130  __kmp_dispatch_free_hierarchies(team);
3131 #endif
3132  __kmp_free(team->t.t_threads);
3133  __kmp_free(team->t.t_disp_buffer);
3134  __kmp_free(team->t.t_dispatch);
3135  __kmp_free(team->t.t_implicit_task_taskdata);
3136  team->t.t_threads = NULL;
3137  team->t.t_disp_buffer = NULL;
3138  team->t.t_dispatch = NULL;
3139  team->t.t_implicit_task_taskdata = NULL;
3140 }
3141 
3142 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3143  kmp_info_t **oldThreads = team->t.t_threads;
3144 
3145  __kmp_free(team->t.t_disp_buffer);
3146  __kmp_free(team->t.t_dispatch);
3147  __kmp_free(team->t.t_implicit_task_taskdata);
3148  __kmp_allocate_team_arrays(team, max_nth);
3149 
3150  KMP_MEMCPY(team->t.t_threads, oldThreads,
3151  team->t.t_nproc * sizeof(kmp_info_t *));
3152 
3153  __kmp_free(oldThreads);
3154 }
3155 
3156 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3157 
3158  kmp_r_sched_t r_sched =
3159  __kmp_get_schedule_global(); // get current state of scheduling globals
3160 
3161 #if OMP_40_ENABLED
3162  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3163 #endif /* OMP_40_ENABLED */
3164 
3165  kmp_internal_control_t g_icvs = {
3166  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3167  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3168  // for nested parallelism (per thread)
3169  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3170  // adjustment of threads (per thread)
3171  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3172  // whether blocktime is explicitly set
3173  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3174 #if KMP_USE_MONITOR
3175  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3176 // intervals
3177 #endif
3178  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3179  // next parallel region (per thread)
3180  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3181  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3182  // for max_active_levels
3183  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3184 // {sched,chunk} pair
3185 #if OMP_40_ENABLED
3186  __kmp_nested_proc_bind.bind_types[0],
3187  __kmp_default_device,
3188 #endif /* OMP_40_ENABLED */
3189  NULL // struct kmp_internal_control *next;
3190  };
3191 
3192  return g_icvs;
3193 }
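// Note on the initializer above: kmp_internal_control_t is filled with a
// positional aggregate initializer, so the values must appear in exactly the
// order the fields are declared (including the #if-guarded KMP_USE_MONITOR
// and OMP_40_ENABLED members); a new ICV added to the struct has to be added
// here in the matching position.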
3194 
3195 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3196 
3197  kmp_internal_control_t gx_icvs;
3198  gx_icvs.serial_nesting_level =
3199  0; // probably =team->t.t_serialized, as in save_inter_controls
3200  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3201  gx_icvs.next = NULL;
3202 
3203  return gx_icvs;
3204 }
3205 
3206 static void __kmp_initialize_root(kmp_root_t *root) {
3207  int f;
3208  kmp_team_t *root_team;
3209  kmp_team_t *hot_team;
3210  int hot_team_max_nth;
3211  kmp_r_sched_t r_sched =
3212  __kmp_get_schedule_global(); // get current state of scheduling globals
3213  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3214  KMP_DEBUG_ASSERT(root);
3215  KMP_ASSERT(!root->r.r_begin);
3216 
3217  /* setup the root state structure */
3218  __kmp_init_lock(&root->r.r_begin_lock);
3219  root->r.r_begin = FALSE;
3220  root->r.r_active = FALSE;
3221  root->r.r_in_parallel = 0;
3222  root->r.r_blocktime = __kmp_dflt_blocktime;
3223  root->r.r_nested = __kmp_dflt_nested;
3224  root->r.r_cg_nthreads = 1;
3225 
3226  /* setup the root team for this task */
3227  /* allocate the root team structure */
3228  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3229 
3230  root_team =
3231  __kmp_allocate_team(root,
3232  1, // new_nproc
3233  1, // max_nproc
3234 #if OMPT_SUPPORT
3235  ompt_data_none, // root parallel id
3236 #endif
3237 #if OMP_40_ENABLED
3238  __kmp_nested_proc_bind.bind_types[0],
3239 #endif
3240  &r_icvs,
3241  0 // argc
3242  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3243  );
3244 #if USE_DEBUGGER
3245  // Non-NULL value should be assigned to make the debugger display the root
3246  // team.
3247  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3248 #endif
3249 
3250  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3251 
3252  root->r.r_root_team = root_team;
3253  root_team->t.t_control_stack_top = NULL;
3254 
3255  /* initialize root team */
3256  root_team->t.t_threads[0] = NULL;
3257  root_team->t.t_nproc = 1;
3258  root_team->t.t_serialized = 1;
3259  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3260  root_team->t.t_sched.sched = r_sched.sched;
3261  KA_TRACE(
3262  20,
3263  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3264  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3265 
3266  /* setup the hot team for this task */
3267  /* allocate the hot team structure */
3268  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3269 
3270  hot_team =
3271  __kmp_allocate_team(root,
3272  1, // new_nproc
3273  __kmp_dflt_team_nth_ub * 2, // max_nproc
3274 #if OMPT_SUPPORT
3275  ompt_data_none, // root parallel id
3276 #endif
3277 #if OMP_40_ENABLED
3278  __kmp_nested_proc_bind.bind_types[0],
3279 #endif
3280  &r_icvs,
3281  0 // argc
3282  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3283  );
3284  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3285 
3286  root->r.r_hot_team = hot_team;
3287  root_team->t.t_control_stack_top = NULL;
3288 
3289  /* first-time initialization */
3290  hot_team->t.t_parent = root_team;
3291 
3292  /* initialize hot team */
3293  hot_team_max_nth = hot_team->t.t_max_nproc;
3294  for (f = 0; f < hot_team_max_nth; ++f) {
3295  hot_team->t.t_threads[f] = NULL;
3296  }
3297  hot_team->t.t_nproc = 1;
3298  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3299  hot_team->t.t_sched.sched = r_sched.sched;
3300  hot_team->t.t_size_changed = 0;
3301 }
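// Net effect of __kmp_initialize_root: the root team is a single-thread,
// serialized team (new_nproc == max_nproc == 1) representing the root
// thread's serial region, while the hot team (with t_parent == root_team) is
// sized for up to __kmp_dflt_team_nth_ub * 2 slots and is kept around for
// reuse by parallel regions rooted at this thread, so team setup cost is not
// paid at every fork.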
3302 
3303 #ifdef KMP_DEBUG
3304 
3305 typedef struct kmp_team_list_item {
3306  kmp_team_p const *entry;
3307  struct kmp_team_list_item *next;
3308 } kmp_team_list_item_t;
3309 typedef kmp_team_list_item_t *kmp_team_list_t;
3310 
3311 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3312  kmp_team_list_t list, // List of teams.
3313  kmp_team_p const *team // Team to add.
3314  ) {
3315 
3316  // List must terminate with item where both entry and next are NULL.
3317  // Team is added to the list only once.
3318  // List is sorted in ascending order by team id.
3319  // Team id is *not* a key.
3320 
3321  kmp_team_list_t l;
3322 
3323  KMP_DEBUG_ASSERT(list != NULL);
3324  if (team == NULL) {
3325  return;
3326  }
3327 
3328  __kmp_print_structure_team_accum(list, team->t.t_parent);
3329  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3330 
3331  // Search list for the team.
3332  l = list;
3333  while (l->next != NULL && l->entry != team) {
3334  l = l->next;
3335  }
3336  if (l->next != NULL) {
3337  return; // Team has been added before, exit.
3338  }
3339 
3340  // Team is not found. Search list again for insertion point.
3341  l = list;
3342  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3343  l = l->next;
3344  }
3345 
3346  // Insert team.
3347  {
3348  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3349  sizeof(kmp_team_list_item_t));
3350  *item = *l;
3351  l->entry = team;
3352  l->next = item;
3353  }
3354 }
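// Illustrative walk-through of the insertion above (hypothetical teams A and
// B, with A->t.t_id == 2 and B->t.t_id == 0), starting from the sentinel list
// { entry = NULL, next = NULL }:
//   add(A): [A] -> sentinel
//   add(B): [B] -> [A] -> sentinel   (sorted ascending by id)
//   add(A): unchanged -- the first scan finds the same team pointer
// Copying *l into the freshly malloc'ed item and then overwriting l in place
// is how the code inserts *before* a node of a singly linked list without
// keeping a prev pointer.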
3355 
3356 static void __kmp_print_structure_team(char const *title,
3357                                        kmp_team_p const *team) {
3358 
3359  __kmp_printf("%s", title);
3360  if (team != NULL) {
3361  __kmp_printf("%2x %p\n", team->t.t_id, team);
3362  } else {
3363  __kmp_printf(" - (nil)\n");
3364  }
3365 }
3366 
3367 static void __kmp_print_structure_thread(char const *title,
3368  kmp_info_p const *thread) {
3369  __kmp_printf("%s", title);
3370  if (thread != NULL) {
3371  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3372  } else {
3373  __kmp_printf(" - (nil)\n");
3374  }
3375 }
3376 
3377 void __kmp_print_structure(void) {
3378 
3379  kmp_team_list_t list;
3380 
3381  // Initialize list of teams.
3382  list =
3383  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3384  list->entry = NULL;
3385  list->next = NULL;
3386 
3387  __kmp_printf("\n------------------------------\nGlobal Thread "
3388  "Table\n------------------------------\n");
3389  {
3390  int gtid;
3391  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3392  __kmp_printf("%2d", gtid);
3393  if (__kmp_threads != NULL) {
3394  __kmp_printf(" %p", __kmp_threads[gtid]);
3395  }
3396  if (__kmp_root != NULL) {
3397  __kmp_printf(" %p", __kmp_root[gtid]);
3398  }
3399  __kmp_printf("\n");
3400  }
3401  }
3402 
3403  // Print out __kmp_threads array.
3404  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3405  "----------\n");
3406  if (__kmp_threads != NULL) {
3407  int gtid;
3408  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3409  kmp_info_t const *thread = __kmp_threads[gtid];
3410  if (thread != NULL) {
3411  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3412  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3413  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3414  __kmp_print_structure_team(" Serial Team: ",
3415  thread->th.th_serial_team);
3416  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3417  __kmp_print_structure_thread(" Master: ",
3418  thread->th.th_team_master);
3419  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3420  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3421 #if OMP_40_ENABLED
3422  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3423 #endif
3424  __kmp_print_structure_thread(" Next in pool: ",
3425  thread->th.th_next_pool);
3426  __kmp_printf("\n");
3427  __kmp_print_structure_team_accum(list, thread->th.th_team);
3428  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3429  }
3430  }
3431  } else {
3432  __kmp_printf("Threads array is not allocated.\n");
3433  }
3434 
3435  // Print out __kmp_root array.
3436  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3437  "--------\n");
3438  if (__kmp_root != NULL) {
3439  int gtid;
3440  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3441  kmp_root_t const *root = __kmp_root[gtid];
3442  if (root != NULL) {
3443  __kmp_printf("GTID %2d %p:\n", gtid, root);
3444  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3445  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3446  __kmp_print_structure_thread(" Uber Thread: ",
3447  root->r.r_uber_thread);
3448  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3449  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3450  __kmp_printf(" In Parallel: %2d\n",
3451  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3452  __kmp_printf("\n");
3453  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3454  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3455  }
3456  }
3457  } else {
3458  __kmp_printf("Ubers array is not allocated.\n");
3459  }
3460 
3461  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3462  "--------\n");
3463  while (list->next != NULL) {
3464  kmp_team_p const *team = list->entry;
3465  int i;
3466  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3467  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3468  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3469  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3470  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3471  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3472  for (i = 0; i < team->t.t_nproc; ++i) {
3473  __kmp_printf(" Thread %2d: ", i);
3474  __kmp_print_structure_thread("", team->t.t_threads[i]);
3475  }
3476  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3477  __kmp_printf("\n");
3478  list = list->next;
3479  }
3480 
3481  // Print out __kmp_thread_pool and __kmp_team_pool.
3482  __kmp_printf("\n------------------------------\nPools\n----------------------"
3483  "--------\n");
3484  __kmp_print_structure_thread("Thread pool: ",
3485  CCAST(kmp_info_t *, __kmp_thread_pool));
3486  __kmp_print_structure_team("Team pool: ",
3487  CCAST(kmp_team_t *, __kmp_team_pool));
3488  __kmp_printf("\n");
3489 
3490  // Free team list.
3491  while (list != NULL) {
3492  kmp_team_list_item_t *item = list;
3493  list = list->next;
3494  KMP_INTERNAL_FREE(item);
3495  }
3496 }
3497 
3498 #endif
3499 
3500 //---------------------------------------------------------------------------
3501 // Stuff for per-thread fast random number generator
3502 // Table of primes
3503 static const unsigned __kmp_primes[] = {
3504  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3505  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3506  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3507  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3508  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3509  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3510  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3511  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3512  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3513  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3514  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3515 
3516 //---------------------------------------------------------------------------
3517 // __kmp_get_random: Get a random number using a linear congruential method.
3518 unsigned short __kmp_get_random(kmp_info_t *thread) {
3519  unsigned x = thread->th.th_x;
3520  unsigned short r = x >> 16;
3521 
3522  thread->th.th_x = x * thread->th.th_a + 1;
3523 
3524  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3525  thread->th.th_info.ds.ds_tid, r));
3526 
3527  return r;
3528 }
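// The generator above is the linear congruential recurrence
//   x_{n+1} = a * x_n + 1   (mod 2^32, via unsigned wraparound)
// with the per-thread multiplier 'a' taken from __kmp_primes; only the high
// 16 bits of x are returned, since the low-order bits of a power-of-two LCG
// are the least random. Illustrative use (hypothetical thread pointer 'thr',
// not a cryptographic or bias-free RNG):
//   unsigned short roll = __kmp_get_random(thr) % 6; // value in 0..5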
3529 //--------------------------------------------------------
3530 // __kmp_init_random: Initialize a random number generator
3531 void __kmp_init_random(kmp_info_t *thread) {
3532  unsigned seed = thread->th.th_info.ds.ds_tid;
3533 
3534  thread->th.th_a =
3535  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3536  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3537  KA_TRACE(30,
3538  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3539 }
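// Seeding sketch: thread tid selects a multiplier from the 64-entry prime
// table above (a = __kmp_primes[tid % 64]) and the initial state is one LCG
// step from (tid + 1), so threads with different tids start on different,
// largely decorrelated sequences. For example, tid 0 and tid 64 share a
// multiplier but still start from different values of x.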
3540 
3541 #if KMP_OS_WINDOWS
3542 /* reclaim array entries for root threads that are already dead, returns number
3543  * reclaimed */
3544 static int __kmp_reclaim_dead_roots(void) {
3545  int i, r = 0;
3546 
3547  for (i = 0; i < __kmp_threads_capacity; ++i) {
3548  if (KMP_UBER_GTID(i) &&
3549  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3550  !__kmp_root[i]
3551  ->r.r_active) { // AC: reclaim only roots that died in a non-active state
3552  r += __kmp_unregister_root_other_thread(i);
3553  }
3554  }
3555  return r;
3556 }
3557 #endif
3558 
3559 /* This function attempts to create free entries in __kmp_threads and
3560  __kmp_root, and returns the number of free entries generated.
3561 
3562  For Windows* OS static library, the first mechanism used is to reclaim array
3563  entries for root threads that are already dead.
3564 
3565  On all platforms, expansion is attempted on the arrays __kmp_threads and
3566  __kmp_root, with an appropriate update to __kmp_threads_capacity. Array
3567  capacity is increased by doubling, clipped to __kmp_sys_max_nth. If the
3568  threadprivate cache array has already been created, it is resized (or
3569  __kmp_tp_capacity is raised) to match. Synchronization with
3570  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3571 
3572  After any dead root reclamation, if the clipping value allows expansion to
3573  yield a total of nNeed free slots, the function performs that expansion; if
3574  not, nothing is done beyond the possible initial root thread reclamation.
3575 
3576  If nNeed is negative, the behavior is undefined. */
3577 static int __kmp_expand_threads(int nNeed) {
3578  int added = 0;
3579  int minimumRequiredCapacity;
3580  int newCapacity;
3581  kmp_info_t **newThreads;
3582  kmp_root_t **newRoot;
3583 
3584 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3585 // resizing __kmp_threads does not need additional protection if foreign
3586 // threads are present
3587 
3588 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3589  /* only for Windows static library */
3590  /* reclaim array entries for root threads that are already dead */
3591  added = __kmp_reclaim_dead_roots();
3592 
3593  if (nNeed) {
3594  nNeed -= added;
3595  if (nNeed < 0)
3596  nNeed = 0;
3597  }
3598 #endif
3599  if (nNeed <= 0)
3600  return added;
3601 
3602  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3603  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3604  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3605  // > __kmp_max_nth in one of two ways:
3606  //
3607  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3608  // may not be reused by another thread, so we may need to increase
3609  // __kmp_threads_capacity to __kmp_max_nth + 1.
3610  //
3611  // 2) New foreign root(s) are encountered. We always register new foreign
3612  // roots. This may cause a smaller # of threads to be allocated at
3613  // subsequent parallel regions, but the worker threads hang around (and
3614  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3615  //
3616  // Anyway, that is the reason for moving the check to see if
3617  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3618  // instead of having it performed here. -BB
3619 
3620  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3621 
3622  /* compute expansion headroom to check if we can expand */
3623  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3624  /* possible expansion too small -- give up */
3625  return added;
3626  }
3627  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3628 
3629  newCapacity = __kmp_threads_capacity;
3630  do {
3631  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3632  : __kmp_sys_max_nth;
3633  } while (newCapacity < minimumRequiredCapacity);
3634  newThreads = (kmp_info_t **)__kmp_allocate(
3635  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3636  newRoot =
3637  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3638  KMP_MEMCPY(newThreads, __kmp_threads,
3639  __kmp_threads_capacity * sizeof(kmp_info_t *));
3640  KMP_MEMCPY(newRoot, __kmp_root,
3641  __kmp_threads_capacity * sizeof(kmp_root_t *));
3642 
3643  kmp_info_t **temp_threads = __kmp_threads;
3644  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3645  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3646  __kmp_free(temp_threads);
3647  added += newCapacity - __kmp_threads_capacity;
3648  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3649 
3650  if (newCapacity > __kmp_tp_capacity) {
3651  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3652  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3653  __kmp_threadprivate_resize_cache(newCapacity);
3654  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3655  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3656  }
3657  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3658  }
3659 
3660  return added;
3661 }
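// Worked example of the growth loop above (illustrative numbers): with
// __kmp_threads_capacity == 32, nNeed == 40 and __kmp_sys_max_nth == 1024,
// minimumRequiredCapacity is 72 and newCapacity doubles 32 -> 64 -> 128,
// stopping at the first value >= 72. If a doubling step would pass
// __kmp_sys_max_nth, newCapacity is clamped to __kmp_sys_max_nth; the headroom
// check before the loop guarantees minimumRequiredCapacity never exceeds
// __kmp_sys_max_nth, so the loop always terminates.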
3662 
3663 /* Register the current thread as a root thread and obtain our gtid. We must
3664  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3665  thread that calls from __kmp_do_serial_initialize() */
3666 int __kmp_register_root(int initial_thread) {
3667  kmp_info_t *root_thread;
3668  kmp_root_t *root;
3669  int gtid;
3670  int capacity;
3671  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3672  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3673  KMP_MB();
3674 
3675  /* 2007-03-02:
3676  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3677  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3678  condition does not work as expected -- it may return false (meaning there
3679  is at least one empty slot in the __kmp_threads array), but it is possible
3680  that the only free slot is #0, which is reserved for the initial thread and
3681  so cannot be used for this one. The following code works around this bug.
3682 
3683  However, the right solution seems to be not to reserve slot #0 for the
3684  initial thread because:
3685  (1) there is no magic in slot #0,
3686  (2) we cannot detect the initial thread reliably (the first thread that
3687  performs serial initialization may not be a real initial thread).
3688  */
3689  capacity = __kmp_threads_capacity;
3690  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3691  --capacity;
3692  }
3693 
3694  /* see if there are too many threads */
3695  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3696  if (__kmp_tp_cached) {
3697  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3698  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3699  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3700  } else {
3701  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3702  __kmp_msg_null);
3703  }
3704  }
3705 
3706  /* find an available thread slot */
3707  /* Don't reassign the zero slot since we need that to only be used by initial
3708  thread */
3709  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3710  gtid++)
3711  ;
3712  KA_TRACE(1,
3713  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3714  KMP_ASSERT(gtid < __kmp_threads_capacity);
3715 
3716  /* update global accounting */
3717  __kmp_all_nth++;
3718  TCW_4(__kmp_nth, __kmp_nth + 1);
3719 
3720  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3721  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3722  if (__kmp_adjust_gtid_mode) {
3723  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3724  if (TCR_4(__kmp_gtid_mode) != 2) {
3725  TCW_4(__kmp_gtid_mode, 2);
3726  }
3727  } else {
3728  if (TCR_4(__kmp_gtid_mode) != 1) {
3729  TCW_4(__kmp_gtid_mode, 1);
3730  }
3731  }
3732  }
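  // In practice this means: while __kmp_all_nth stays below __kmp_tls_gtid_min
  // the runtime keeps the cheaper stack-address search (__kmp_gtid_mode 1),
  // and once the thread count reaches that threshold it switches to the keyed
  // thread-specific-data lookup (__kmp_gtid_mode 2), which scales better for
  // large thread counts; the mode simply tracks which side of the threshold
  // the current thread count is on.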
3733 
3734 #ifdef KMP_ADJUST_BLOCKTIME
3735  /* Adjust blocktime to zero if necessary */
3736  /* Middle initialization might not have occurred yet */
3737  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3738  if (__kmp_nth > __kmp_avail_proc) {
3739  __kmp_zero_bt = TRUE;
3740  }
3741  }
3742 #endif /* KMP_ADJUST_BLOCKTIME */
3743 
3744  /* setup this new hierarchy */
3745  if (!(root = __kmp_root[gtid])) {
3746  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3747  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3748  }
3749 
3750 #if KMP_STATS_ENABLED
3751  // Initialize stats as soon as possible (right after gtid assignment).
3752  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3753  __kmp_stats_thread_ptr->startLife();
3754  KMP_SET_THREAD_STATE(SERIAL_REGION);
3755  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3756 #endif
3757  __kmp_initialize_root(root);
3758 
3759  /* setup new root thread structure */
3760  if (root->r.r_uber_thread) {
3761  root_thread = root->r.r_uber_thread;
3762  } else {
3763  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3764  if (__kmp_storage_map) {
3765  __kmp_print_thread_storage_map(root_thread, gtid);
3766  }
3767  root_thread->th.th_info.ds.ds_gtid = gtid;
3768 #if OMPT_SUPPORT
3769  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3770 #endif
3771  root_thread->th.th_root = root;
3772  if (__kmp_env_consistency_check) {
3773  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3774  }
3775 #if USE_FAST_MEMORY
3776  __kmp_initialize_fast_memory(root_thread);
3777 #endif /* USE_FAST_MEMORY */
3778 
3779 #if KMP_USE_BGET
3780  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3781  __kmp_initialize_bget(root_thread);
3782 #endif
3783  __kmp_init_random(root_thread); // Initialize random number generator
3784  }
3785 
3786  /* setup the serial team held in reserve by the root thread */
3787  if (!root_thread->th.th_serial_team) {
3788  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3789  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3790  root_thread->th.th_serial_team =
3791  __kmp_allocate_team(root, 1, 1,
3792 #if OMPT_SUPPORT
3793  ompt_data_none, // root parallel id
3794 #endif
3795 #if OMP_40_ENABLED
3796  proc_bind_default,
3797 #endif
3798  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3799  }
3800  KMP_ASSERT(root_thread->th.th_serial_team);
3801  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3802  root_thread->th.th_serial_team));
3803 
3804  /* drop root_thread into place */
3805  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3806 
3807  root->r.r_root_team->t.t_threads[0] = root_thread;
3808  root->r.r_hot_team->t.t_threads[0] = root_thread;
3809  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3810  // AC: the team is created in reserve, not for execution (it is unused for now).
3811  root_thread->th.th_serial_team->t.t_serialized = 0;
3812  root->r.r_uber_thread = root_thread;
3813 
3814  /* initialize the thread, get it ready to go */
3815  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3816  TCW_4(__kmp_init_gtid, TRUE);
3817 
3818  /* prepare the master thread for get_gtid() */
3819  __kmp_gtid_set_specific(gtid);
3820 
3821 #if USE_ITT_BUILD
3822  __kmp_itt_thread_name(gtid);
3823 #endif /* USE_ITT_BUILD */
3824 
3825 #ifdef KMP_TDATA_GTID
3826  __kmp_gtid = gtid;
3827 #endif
3828  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3829  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3830 
3831  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3832  "plain=%u\n",
3833  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3834  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3835  KMP_INIT_BARRIER_STATE));
3836  { // Initialize barrier data.
3837  int b;
3838  for (b = 0; b < bs_last_barrier; ++b) {
3839  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3840 #if USE_DEBUGGER
3841  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3842 #endif
3843  }
3844  }
3845  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3846  KMP_INIT_BARRIER_STATE);
3847 
3848 #if KMP_AFFINITY_SUPPORTED
3849 #if OMP_40_ENABLED
3850  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3851  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3852  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3853  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3854 #endif
3855  if (TCR_4(__kmp_init_middle)) {
3856  __kmp_affinity_set_init_mask(gtid, TRUE);
3857  }
3858 #endif /* KMP_AFFINITY_SUPPORTED */
3859 #if OMP_50_ENABLED
3860  root_thread->th.th_def_allocator = __kmp_def_allocator;
3861  root_thread->th.th_prev_level = 0;
3862  root_thread->th.th_prev_num_threads = 1;
3863 #endif
3864 
3865  __kmp_root_counter++;
3866 
3867 #if OMPT_SUPPORT
3868  if (!initial_thread && ompt_enabled.enabled) {
3869 
3870  kmp_info_t *root_thread = ompt_get_thread();
3871 
3872  ompt_set_thread_state(root_thread, ompt_state_overhead);
3873 
3874  if (ompt_enabled.ompt_callback_thread_begin) {
3875  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3876  ompt_thread_initial, __ompt_get_thread_data_internal());
3877  }
3878  ompt_data_t *task_data;
3879  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3880  if (ompt_enabled.ompt_callback_task_create) {
3881  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3882  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3883  // initial task has nothing to return to
3884  }
3885 
3886  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3887  }
3888 #endif
3889 
3890  KMP_MB();
3891  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3892 
3893  return gtid;
3894 }
3895 
3896 #if KMP_NESTED_HOT_TEAMS
3897 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3898  const int max_level) {
3899  int i, n, nth;
3900  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3901  if (!hot_teams || !hot_teams[level].hot_team) {
3902  return 0;
3903  }
3904  KMP_DEBUG_ASSERT(level < max_level);
3905  kmp_team_t *team = hot_teams[level].hot_team;
3906  nth = hot_teams[level].hot_team_nth;
3907  n = nth - 1; // master is not freed
3908  if (level < max_level - 1) {
3909  for (i = 0; i < nth; ++i) {
3910  kmp_info_t *th = team->t.t_threads[i];
3911  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3912  if (i > 0 && th->th.th_hot_teams) {
3913  __kmp_free(th->th.th_hot_teams);
3914  th->th.th_hot_teams = NULL;
3915  }
3916  }
3917  }
3918  __kmp_free_team(root, team, NULL);
3919  return n;
3920 }
3921 #endif
3922 
3923 // Resets a root thread and clears its root and hot teams.
3924 // Returns the number of __kmp_threads entries directly and indirectly freed.
3925 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3926  kmp_team_t *root_team = root->r.r_root_team;
3927  kmp_team_t *hot_team = root->r.r_hot_team;
3928  int n = hot_team->t.t_nproc;
3929  int i;
3930 
3931  KMP_DEBUG_ASSERT(!root->r.r_active);
3932 
3933  root->r.r_root_team = NULL;
3934  root->r.r_hot_team = NULL;
3935  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3936  // before call to __kmp_free_team().
3937  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3938 #if KMP_NESTED_HOT_TEAMS
3939  if (__kmp_hot_teams_max_level >
3940  0) { // need to free nested hot teams and their threads if any
3941  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3942  kmp_info_t *th = hot_team->t.t_threads[i];
3943  if (__kmp_hot_teams_max_level > 1) {
3944  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3945  }
3946  if (th->th.th_hot_teams) {
3947  __kmp_free(th->th.th_hot_teams);
3948  th->th.th_hot_teams = NULL;
3949  }
3950  }
3951  }
3952 #endif
3953  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3954 
3955  // Before we can reap the thread, we need to make certain that all other
3956  // threads in the teams that had this root as ancestor have stopped trying to
3957  // steal tasks.
3958  if (__kmp_tasking_mode != tskm_immediate_exec) {
3959  __kmp_wait_to_unref_task_teams();
3960  }
3961 
3962 #if KMP_OS_WINDOWS
3963  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3964  KA_TRACE(
3965  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3966  "\n",
3967  (LPVOID) & (root->r.r_uber_thread->th),
3968  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3969  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3970 #endif /* KMP_OS_WINDOWS */
3971 
3972 #if OMPT_SUPPORT
3973  if (ompt_enabled.ompt_callback_thread_end) {
3974  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3975  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3976  }
3977 #endif
3978 
3979  TCW_4(__kmp_nth,
3980  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3981  root->r.r_cg_nthreads--;
3982 
3983  __kmp_reap_thread(root->r.r_uber_thread, 1);
3984 
3985  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3986  // it instead of freeing it.
3987  root->r.r_uber_thread = NULL;
3988  /* mark root as no longer in use */
3989  root->r.r_begin = FALSE;
3990 
3991  return n;
3992 }
3993 
3994 void __kmp_unregister_root_current_thread(int gtid) {
3995  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3996  /* this lock should be ok, since unregister_root_current_thread is never
3997  called during an abort, only during a normal close. furthermore, if you
3998  have the forkjoin lock, you should never try to get the initz lock */
3999  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4000  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4001  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4002  "exiting T#%d\n",
4003  gtid));
4004  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4005  return;
4006  }
4007  kmp_root_t *root = __kmp_root[gtid];
4008 
4009  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4010  KMP_ASSERT(KMP_UBER_GTID(gtid));
4011  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4012  KMP_ASSERT(root->r.r_active == FALSE);
4013 
4014  KMP_MB();
4015 
4016 #if OMP_45_ENABLED
4017  kmp_info_t *thread = __kmp_threads[gtid];
4018  kmp_team_t *team = thread->th.th_team;
4019  kmp_task_team_t *task_team = thread->th.th_task_team;
4020 
4021  // we need to wait for the proxy tasks before finishing the thread
4022  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4023 #if OMPT_SUPPORT
4024  // the runtime is shutting down so we won't report any events
4025  thread->th.ompt_thread_info.state = ompt_state_undefined;
4026 #endif
4027  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4028  }
4029 #endif
4030 
4031  __kmp_reset_root(gtid, root);
4032 
4033  /* free up this thread slot */
4034  __kmp_gtid_set_specific(KMP_GTID_DNE);
4035 #ifdef KMP_TDATA_GTID
4036  __kmp_gtid = KMP_GTID_DNE;
4037 #endif
4038 
4039  KMP_MB();
4040  KC_TRACE(10,
4041  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4042 
4043  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4044 }
4045 
4046 #if KMP_OS_WINDOWS
4047 /* __kmp_forkjoin_lock must be already held
4048  Unregisters a root thread that is not the current thread. Returns the number
4049  of __kmp_threads entries freed as a result. */
4050 static int __kmp_unregister_root_other_thread(int gtid) {
4051  kmp_root_t *root = __kmp_root[gtid];
4052  int r;
4053 
4054  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4055  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4056  KMP_ASSERT(KMP_UBER_GTID(gtid));
4057  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4058  KMP_ASSERT(root->r.r_active == FALSE);
4059 
4060  r = __kmp_reset_root(gtid, root);
4061  KC_TRACE(10,
4062  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4063  return r;
4064 }
4065 #endif
4066 
4067 #if KMP_DEBUG
4068 void __kmp_task_info() {
4069 
4070  kmp_int32 gtid = __kmp_entry_gtid();
4071  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4072  kmp_info_t *this_thr = __kmp_threads[gtid];
4073  kmp_team_t *steam = this_thr->th.th_serial_team;
4074  kmp_team_t *team = this_thr->th.th_team;
4075 
4076  __kmp_printf(
4077  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4078  "ptask=%p\n",
4079  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4080  team->t.t_implicit_task_taskdata[tid].td_parent);
4081 }
4082 #endif // KMP_DEBUG
4083 
4084 /* TODO optimize with one big memclr, take out what isn't needed, split
4085  responsibility to workers as much as possible, and delay initialization of
4086  features as much as possible */
4087 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4088  int tid, int gtid) {
4089  /* this_thr->th.th_info.ds.ds_gtid is setup in
4090  kmp_allocate_thread/create_worker.
4091  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4092  kmp_info_t *master = team->t.t_threads[0];
4093  KMP_DEBUG_ASSERT(this_thr != NULL);
4094  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4095  KMP_DEBUG_ASSERT(team);
4096  KMP_DEBUG_ASSERT(team->t.t_threads);
4097  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4098  KMP_DEBUG_ASSERT(master);
4099  KMP_DEBUG_ASSERT(master->th.th_root);
4100 
4101  KMP_MB();
4102 
4103  TCW_SYNC_PTR(this_thr->th.th_team, team);
4104 
4105  this_thr->th.th_info.ds.ds_tid = tid;
4106  this_thr->th.th_set_nproc = 0;
4107  if (__kmp_tasking_mode != tskm_immediate_exec)
4108  // When tasking is possible, threads are not safe to reap until they are
4109  // done tasking; this will be set when tasking code is exited in wait
4110  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4111  else // no tasking --> always safe to reap
4112  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4113 #if OMP_40_ENABLED
4114  this_thr->th.th_set_proc_bind = proc_bind_default;
4115 #if KMP_AFFINITY_SUPPORTED
4116  this_thr->th.th_new_place = this_thr->th.th_current_place;
4117 #endif
4118 #endif
4119  this_thr->th.th_root = master->th.th_root;
4120 
4121  /* setup the thread's cache of the team structure */
4122  this_thr->th.th_team_nproc = team->t.t_nproc;
4123  this_thr->th.th_team_master = master;
4124  this_thr->th.th_team_serialized = team->t.t_serialized;
4125  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4126 
4127  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4128 
4129  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4130  tid, gtid, this_thr, this_thr->th.th_current_task));
4131 
4132  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4133  team, tid, TRUE);
4134 
4135  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4136  tid, gtid, this_thr, this_thr->th.th_current_task));
4137  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4138  // __kmp_initialize_team()?
4139 
4140  /* TODO no worksharing in speculative threads */
4141  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4142 
4143  this_thr->th.th_local.this_construct = 0;
4144 
4145  if (!this_thr->th.th_pri_common) {
4146  this_thr->th.th_pri_common =
4147  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4148  if (__kmp_storage_map) {
4149  __kmp_print_storage_map_gtid(
4150  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4151  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4152  }
4153  this_thr->th.th_pri_head = NULL;
4154  }
4155 
4156  /* Initialize dynamic dispatch */
4157  {
4158  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4159  // Use team max_nproc since this will never change for the team.
4160  size_t disp_size =
4161  sizeof(dispatch_private_info_t) *
4162  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4163  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4164  team->t.t_max_nproc));
4165  KMP_ASSERT(dispatch);
4166  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4167  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4168 
4169  dispatch->th_disp_index = 0;
4170 #if OMP_45_ENABLED
4171  dispatch->th_doacross_buf_idx = 0;
4172 #endif
4173  if (!dispatch->th_disp_buffer) {
4174  dispatch->th_disp_buffer =
4175  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4176 
4177  if (__kmp_storage_map) {
4178  __kmp_print_storage_map_gtid(
4179  gtid, &dispatch->th_disp_buffer[0],
4180  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4181  ? 1
4182  : __kmp_dispatch_num_buffers],
4183  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4184  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4185  gtid, team->t.t_id, gtid);
4186  }
4187  } else {
4188  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4189  }
4190 
4191  dispatch->th_dispatch_pr_current = 0;
4192  dispatch->th_dispatch_sh_current = 0;
4193 
4194  dispatch->th_deo_fcn = 0; /* ORDERED */
4195  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4196  }
4197 
4198  this_thr->th.th_next_pool = NULL;
4199 
4200  if (!this_thr->th.th_task_state_memo_stack) {
4201  size_t i;
4202  this_thr->th.th_task_state_memo_stack =
4203  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4204  this_thr->th.th_task_state_top = 0;
4205  this_thr->th.th_task_state_stack_sz = 4;
4206  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4207  ++i) // zero init the stack
4208  this_thr->th.th_task_state_memo_stack[i] = 0;
4209  }
4210 
4211  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4212  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4213 
4214  KMP_MB();
4215 }
4216 
4217 /* Allocate a new thread for the requesting team. This is only called from
4218  within a forkjoin critical section. We first try to get an available thread
4219  from the thread pool; if none is available, we fork a new one, assuming we
4220  are able to create one. This should be assured, as the caller should have
4221  checked on this first. */
4222 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4223  int new_tid) {
4224  kmp_team_t *serial_team;
4225  kmp_info_t *new_thr;
4226  int new_gtid;
4227 
4228  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4229  KMP_DEBUG_ASSERT(root && team);
4230 #if !KMP_NESTED_HOT_TEAMS
4231  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4232 #endif
4233  KMP_MB();
4234 
4235  /* first, try to get one from the thread pool */
4236  if (__kmp_thread_pool) {
4237 
4238  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4239  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4240  if (new_thr == __kmp_thread_pool_insert_pt) {
4241  __kmp_thread_pool_insert_pt = NULL;
4242  }
4243  TCW_4(new_thr->th.th_in_pool, FALSE);
4244  // Don't touch th_active_in_pool or th_active.
4245  // The worker thread adjusts those flags as it sleeps/awakens.
4246  __kmp_thread_pool_nth--;
4247 
4248  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4249  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4250  KMP_ASSERT(!new_thr->th.th_team);
4251  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4252  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4253 
4254  /* setup the thread structure */
4255  __kmp_initialize_info(new_thr, team, new_tid,
4256  new_thr->th.th_info.ds.ds_gtid);
4257  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4258 
4259  TCW_4(__kmp_nth, __kmp_nth + 1);
4260  root->r.r_cg_nthreads++;
4261 
4262  new_thr->th.th_task_state = 0;
4263  new_thr->th.th_task_state_top = 0;
4264  new_thr->th.th_task_state_stack_sz = 4;
4265 
4266 #ifdef KMP_ADJUST_BLOCKTIME
4267  /* Adjust blocktime back to zero if necessary */
4268  /* Middle initialization might not have occurred yet */
4269  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4270  if (__kmp_nth > __kmp_avail_proc) {
4271  __kmp_zero_bt = TRUE;
4272  }
4273  }
4274 #endif /* KMP_ADJUST_BLOCKTIME */
4275 
4276 #if KMP_DEBUG
4277  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4278  // KMP_BARRIER_PARENT_FLAG.
4279  int b;
4280  kmp_balign_t *balign = new_thr->th.th_bar;
4281  for (b = 0; b < bs_last_barrier; ++b)
4282  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4283 #endif
4284 
4285  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4286  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4287 
4288  KMP_MB();
4289  return new_thr;
4290  }
4291 
4292  /* no, we'll fork a new one */
4293  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4294  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4295 
4296 #if KMP_USE_MONITOR
4297  // If this is the first worker thread the RTL is creating, then also
4298  // launch the monitor thread. We try to do this as early as possible.
4299  if (!TCR_4(__kmp_init_monitor)) {
4300  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4301  if (!TCR_4(__kmp_init_monitor)) {
4302  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4303  TCW_4(__kmp_init_monitor, 1);
4304  __kmp_create_monitor(&__kmp_monitor);
4305  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4306 #if KMP_OS_WINDOWS
4307  // AC: wait until monitor has started. This is a fix for CQ232808.
4308  // The reason is that if the library is loaded/unloaded in a loop with
4309  // small (parallel) work in between, then there is a high probability that
4310  // the monitor thread starts after the library shutdown. At shutdown it is
4311  // too late to cope with the problem, because when the master is in
4312  // DllMain (process detach) the monitor has no chance to start (it is
4313  // blocked), and the master has no means to inform the monitor that the
4314  // library has gone, because all the memory the monitor can access
4315  // is going to be released/reset.
4316  while (TCR_4(__kmp_init_monitor) < 2) {
4317  KMP_YIELD(TRUE);
4318  }
4319  KF_TRACE(10, ("after monitor thread has started\n"));
4320 #endif
4321  }
4322  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4323  }
4324 #endif
4325 
4326  KMP_MB();
4327  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4328  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4329  }
4330 
4331  /* allocate space for it. */
4332  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4333 
4334  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4335 
4336  if (__kmp_storage_map) {
4337  __kmp_print_thread_storage_map(new_thr, new_gtid);
4338  }
4339 
4340  // add the reserve serialized team, initialized from the team's master thread
4341  {
4342  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4343  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4344  new_thr->th.th_serial_team = serial_team =
4345  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4346 #if OMPT_SUPPORT
4347  ompt_data_none, // root parallel id
4348 #endif
4349 #if OMP_40_ENABLED
4350  proc_bind_default,
4351 #endif
4352  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4353  }
4354  KMP_ASSERT(serial_team);
4355  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4356  // for execution (it is unused for now).
4357  serial_team->t.t_threads[0] = new_thr;
4358  KF_TRACE(10,
4359  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4360  new_thr));
4361 
4362  /* setup the thread structures */
4363  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4364 
4365 #if USE_FAST_MEMORY
4366  __kmp_initialize_fast_memory(new_thr);
4367 #endif /* USE_FAST_MEMORY */
4368 
4369 #if KMP_USE_BGET
4370  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4371  __kmp_initialize_bget(new_thr);
4372 #endif
4373 
4374  __kmp_init_random(new_thr); // Initialize random number generator
4375 
4376  /* Initialize these only once when thread is grabbed for a team allocation */
4377  KA_TRACE(20,
4378  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4379  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4380 
4381  int b;
4382  kmp_balign_t *balign = new_thr->th.th_bar;
4383  for (b = 0; b < bs_last_barrier; ++b) {
4384  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4385  balign[b].bb.team = NULL;
4386  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4387  balign[b].bb.use_oncore_barrier = 0;
4388  }
4389 
4390  new_thr->th.th_spin_here = FALSE;
4391  new_thr->th.th_next_waiting = 0;
4392 #if KMP_OS_UNIX
4393  new_thr->th.th_blocking = false;
4394 #endif
4395 
4396 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4397  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4398  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4399  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4400  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4401 #endif
4402 #if OMP_50_ENABLED
4403  new_thr->th.th_def_allocator = __kmp_def_allocator;
4404  new_thr->th.th_prev_level = 0;
4405  new_thr->th.th_prev_num_threads = 1;
4406 #endif
4407 
4408  TCW_4(new_thr->th.th_in_pool, FALSE);
4409  new_thr->th.th_active_in_pool = FALSE;
4410  TCW_4(new_thr->th.th_active, TRUE);
4411 
4412  /* adjust the global counters */
4413  __kmp_all_nth++;
4414  __kmp_nth++;
4415 
4416  root->r.r_cg_nthreads++;
4417 
4418  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4419  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4420  if (__kmp_adjust_gtid_mode) {
4421  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4422  if (TCR_4(__kmp_gtid_mode) != 2) {
4423  TCW_4(__kmp_gtid_mode, 2);
4424  }
4425  } else {
4426  if (TCR_4(__kmp_gtid_mode) != 1) {
4427  TCW_4(__kmp_gtid_mode, 1);
4428  }
4429  }
4430  }
4431 
4432 #ifdef KMP_ADJUST_BLOCKTIME
4433  /* Adjust blocktime back to zero if necessary */
4434  /* Middle initialization might not have occurred yet */
4435  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4436  if (__kmp_nth > __kmp_avail_proc) {
4437  __kmp_zero_bt = TRUE;
4438  }
4439  }
4440 #endif /* KMP_ADJUST_BLOCKTIME */
4441 
4442  /* actually fork it and create the new worker thread */
4443  KF_TRACE(
4444  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4445  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4446  KF_TRACE(10,
4447  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4448 
4449  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4450  new_gtid));
4451  KMP_MB();
4452  return new_thr;
4453 }
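// Allocation summary: the pool path above reuses a parked kmp_info_t together
// with its existing gtid and reserve serial team, while the fork path claims
// the first free gtid >= 1, allocates a fresh kmp_info_t plus a reserve
// serial team, and only then starts the OS thread via __kmp_create_worker.
// A hypothetical caller sketch (mirroring how team allocation uses it, inside
// the forkjoin critical section and after capacity has been ensured):
//   kmp_info_t *w = __kmp_allocate_thread(root, team, f);
//   team->t.t_threads[f] = w; // publish the worker into its team slot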
4454 
4455 /* Reinitialize team for reuse.
4456  The hot team code calls this at every fork barrier, so EPCC barrier tests
4457  are extremely sensitive to changes in it, especially writes to the team
4458  struct, which cause a cache invalidation in all threads.
4459  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4460 static void __kmp_reinitialize_team(kmp_team_t *team,
4461  kmp_internal_control_t *new_icvs,
4462  ident_t *loc) {
4463  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4464  team->t.t_threads[0], team));
4465  KMP_DEBUG_ASSERT(team && new_icvs);
4466  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4467  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4468 
4469  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4470  // Copy ICVs to the master thread's implicit taskdata
4471  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4472  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4473 
4474  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4475  team->t.t_threads[0], team));
4476 }
4477 
4478 /* Initialize the team data structure.
4479  This assumes the t_threads and t_max_nproc are already set.
4480  Also, we don't touch the arguments */
4481 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4482  kmp_internal_control_t *new_icvs,
4483  ident_t *loc) {
4484  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4485 
4486  /* verify */
4487  KMP_DEBUG_ASSERT(team);
4488  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4489  KMP_DEBUG_ASSERT(team->t.t_threads);
4490  KMP_MB();
4491 
4492  team->t.t_master_tid = 0; /* not needed */
4493  /* team->t.t_master_bar; not needed */
4494  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4495  team->t.t_nproc = new_nproc;
4496 
4497  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4498  team->t.t_next_pool = NULL;
4499  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4500  * up hot team */
4501 
4502  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4503  team->t.t_invoke = NULL; /* not needed */
4504 
4505  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4506  team->t.t_sched.sched = new_icvs->sched.sched;
4507 
4508 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4509  team->t.t_fp_control_saved = FALSE; /* not needed */
4510  team->t.t_x87_fpu_control_word = 0; /* not needed */
4511  team->t.t_mxcsr = 0; /* not needed */
4512 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4513 
4514  team->t.t_construct = 0;
4515 
4516  team->t.t_ordered.dt.t_value = 0;
4517  team->t.t_master_active = FALSE;
4518 
4519  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4520 
4521 #ifdef KMP_DEBUG
4522  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4523 #endif
4524 #if KMP_OS_WINDOWS
4525  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4526 #endif
4527 
4528  team->t.t_control_stack_top = NULL;
4529 
4530  __kmp_reinitialize_team(team, new_icvs, loc);
4531 
4532  KMP_MB();
4533  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4534 }
4535 
4536 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4537 /* Sets full mask for the thread; stores the old mask in *old_mask if non-NULL. No changes to structures. */
4538 static void
4539 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4540  if (KMP_AFFINITY_CAPABLE()) {
4541  int status;
4542  if (old_mask != NULL) {
4543  status = __kmp_get_system_affinity(old_mask, TRUE);
4544  int error = errno;
4545  if (status != 0) {
4546  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4547  __kmp_msg_null);
4548  }
4549  }
4550  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4551  }
4552 }
4553 #endif
4554 
4555 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4556 
4557 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4558 // It calculates the worker + master thread's partition based upon the parent
4559 // thread's partition, and binds each worker to a place in its partition.
4560 // The master thread's partition should already include its current binding.
4561 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4562  // Copy the master thread's place partition to the team struct
4563  kmp_info_t *master_th = team->t.t_threads[0];
4564  KMP_DEBUG_ASSERT(master_th != NULL);
4565  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4566  int first_place = master_th->th.th_first_place;
4567  int last_place = master_th->th.th_last_place;
4568  int masters_place = master_th->th.th_current_place;
4569  team->t.t_first_place = first_place;
4570  team->t.t_last_place = last_place;
4571 
4572  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4573  "bound to place %d partition = [%d,%d]\n",
4574  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4575  team->t.t_id, masters_place, first_place, last_place));
4576 
4577  switch (proc_bind) {
4578 
4579  case proc_bind_default:
4580  // Serial teams might have the proc_bind policy set to proc_bind_default; it
4581  // doesn't matter, as we don't rebind the master thread for any proc_bind policy.
4582  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4583  break;
4584 
4585  case proc_bind_master: {
4586  int f;
4587  int n_th = team->t.t_nproc;
4588  for (f = 1; f < n_th; f++) {
4589  kmp_info_t *th = team->t.t_threads[f];
4590  KMP_DEBUG_ASSERT(th != NULL);
4591  th->th.th_first_place = first_place;
4592  th->th.th_last_place = last_place;
4593  th->th.th_new_place = masters_place;
4594 #if OMP_50_ENABLED
4595  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4596  team->t.t_display_affinity != 1) {
4597  team->t.t_display_affinity = 1;
4598  }
4599 #endif
4600 
4601  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4602  "partition = [%d,%d]\n",
4603  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4604  f, masters_place, first_place, last_place));
4605  }
4606  } break;
4607 
4608  case proc_bind_close: {
4609  int f;
4610  int n_th = team->t.t_nproc;
4611  int n_places;
4612  if (first_place <= last_place) {
4613  n_places = last_place - first_place + 1;
4614  } else {
4615  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4616  }
4617  if (n_th <= n_places) {
4618  int place = masters_place;
4619  for (f = 1; f < n_th; f++) {
4620  kmp_info_t *th = team->t.t_threads[f];
4621  KMP_DEBUG_ASSERT(th != NULL);
4622 
4623  if (place == last_place) {
4624  place = first_place;
4625  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4626  place = 0;
4627  } else {
4628  place++;
4629  }
4630  th->th.th_first_place = first_place;
4631  th->th.th_last_place = last_place;
4632  th->th.th_new_place = place;
4633 #if OMP_50_ENABLED
4634  if (__kmp_display_affinity && place != th->th.th_current_place &&
4635  team->t.t_display_affinity != 1) {
4636  team->t.t_display_affinity = 1;
4637  }
4638 #endif
4639 
4640  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4641  "partition = [%d,%d]\n",
4642  __kmp_gtid_from_thread(team->t.t_threads[f]),
4643  team->t.t_id, f, place, first_place, last_place));
4644  }
4645  } else {
4646  int S, rem, gap, s_count;
4647  S = n_th / n_places;
4648  s_count = 0;
4649  rem = n_th - (S * n_places);
4650  gap = rem > 0 ? n_places / rem : n_places;
4651  int place = masters_place;
4652  int gap_ct = gap;
4653  for (f = 0; f < n_th; f++) {
4654  kmp_info_t *th = team->t.t_threads[f];
4655  KMP_DEBUG_ASSERT(th != NULL);
4656 
4657  th->th.th_first_place = first_place;
4658  th->th.th_last_place = last_place;
4659  th->th.th_new_place = place;
4660 #if OMP_50_ENABLED
4661  if (__kmp_display_affinity && place != th->th.th_current_place &&
4662  team->t.t_display_affinity != 1) {
4663  team->t.t_display_affinity = 1;
4664  }
4665 #endif
4666  s_count++;
4667 
4668  if ((s_count == S) && rem && (gap_ct == gap)) {
4669  // do nothing, add an extra thread to place on next iteration
4670  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4671  // we added an extra thread to this place; move to next place
4672  if (place == last_place) {
4673  place = first_place;
4674  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675  place = 0;
4676  } else {
4677  place++;
4678  }
4679  s_count = 0;
4680  gap_ct = 1;
4681  rem--;
4682  } else if (s_count == S) { // place full; don't add extra
4683  if (place == last_place) {
4684  place = first_place;
4685  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4686  place = 0;
4687  } else {
4688  place++;
4689  }
4690  gap_ct++;
4691  s_count = 0;
4692  }
4693 
4694  KA_TRACE(100,
4695  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4696  "partition = [%d,%d]\n",
4697  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4698  th->th.th_new_place, first_place, last_place));
4699  }
4700  KMP_DEBUG_ASSERT(place == masters_place);
4701  }
4702  } break;
4703 
4704  case proc_bind_spread: {
4705  int f;
4706  int n_th = team->t.t_nproc;
4707  int n_places;
4708  int thidx;
4709  if (first_place <= last_place) {
4710  n_places = last_place - first_place + 1;
4711  } else {
4712  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4713  }
4714  if (n_th <= n_places) {
4715  int place = -1;
4716 
4717  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4718  int S = n_places / n_th;
4719  int s_count, rem, gap, gap_ct;
4720 
4721  place = masters_place;
4722  rem = n_places - n_th * S;
4723  gap = rem ? n_th / rem : 1;
4724  gap_ct = gap;
4725  thidx = n_th;
4726  if (update_master_only == 1)
4727  thidx = 1;
4728  for (f = 0; f < thidx; f++) {
4729  kmp_info_t *th = team->t.t_threads[f];
4730  KMP_DEBUG_ASSERT(th != NULL);
4731 
4732  th->th.th_first_place = place;
4733  th->th.th_new_place = place;
4734 #if OMP_50_ENABLED
4735  if (__kmp_display_affinity && place != th->th.th_current_place &&
4736  team->t.t_display_affinity != 1) {
4737  team->t.t_display_affinity = 1;
4738  }
4739 #endif
4740  s_count = 1;
4741  while (s_count < S) {
4742  if (place == last_place) {
4743  place = first_place;
4744  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4745  place = 0;
4746  } else {
4747  place++;
4748  }
4749  s_count++;
4750  }
4751  if (rem && (gap_ct == gap)) {
4752  if (place == last_place) {
4753  place = first_place;
4754  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4755  place = 0;
4756  } else {
4757  place++;
4758  }
4759  rem--;
4760  gap_ct = 0;
4761  }
4762  th->th.th_last_place = place;
4763  gap_ct++;
4764 
4765  if (place == last_place) {
4766  place = first_place;
4767  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4768  place = 0;
4769  } else {
4770  place++;
4771  }
4772 
4773  KA_TRACE(100,
4774  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4775  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4776  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4777  f, th->th.th_new_place, th->th.th_first_place,
4778  th->th.th_last_place, __kmp_affinity_num_masks));
4779  }
4780  } else {
4781  /* Having a uniform space of available computation places, we can create
4782  T partitions of round(P/T) size and put a thread into the first
4783  place of each partition. */
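  /* A worked example (values assumed for illustration, not taken from the
     source): with n_th = 4 threads, n_places = 8 places and masters_place = 0,
     spacing = (8 + 1) / 4 = 2.25, so the loop below assigns the partitions
     [0,1], [2,3], [4,5], [6,7] and binds the threads to places 0, 2, 4, 6. */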
4784  double current = static_cast<double>(masters_place);
4785  double spacing =
4786  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4787  int first, last;
4788  kmp_info_t *th;
4789 
4790  thidx = n_th + 1;
4791  if (update_master_only == 1)
4792  thidx = 1;
4793  for (f = 0; f < thidx; f++) {
4794  first = static_cast<int>(current);
4795  last = static_cast<int>(current + spacing) - 1;
4796  KMP_DEBUG_ASSERT(last >= first);
4797  if (first >= n_places) {
4798  if (masters_place) {
4799  first -= n_places;
4800  last -= n_places;
4801  if (first == (masters_place + 1)) {
4802  KMP_DEBUG_ASSERT(f == n_th);
4803  first--;
4804  }
4805  if (last == masters_place) {
4806  KMP_DEBUG_ASSERT(f == (n_th - 1));
4807  last--;
4808  }
4809  } else {
4810  KMP_DEBUG_ASSERT(f == n_th);
4811  first = 0;
4812  last = 0;
4813  }
4814  }
4815  if (last >= n_places) {
4816  last = (n_places - 1);
4817  }
4818  place = first;
4819  current += spacing;
4820  if (f < n_th) {
4821  KMP_DEBUG_ASSERT(0 <= first);
4822  KMP_DEBUG_ASSERT(n_places > first);
4823  KMP_DEBUG_ASSERT(0 <= last);
4824  KMP_DEBUG_ASSERT(n_places > last);
4825  KMP_DEBUG_ASSERT(last_place >= first_place);
4826  th = team->t.t_threads[f];
4827  KMP_DEBUG_ASSERT(th);
4828  th->th.th_first_place = first;
4829  th->th.th_new_place = place;
4830  th->th.th_last_place = last;
4831 #if OMP_50_ENABLED
4832  if (__kmp_display_affinity && place != th->th.th_current_place &&
4833  team->t.t_display_affinity != 1) {
4834  team->t.t_display_affinity = 1;
4835  }
4836 #endif
4837  KA_TRACE(100,
4838  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4839  "partition = [%d,%d], spacing = %.4f\n",
4840  __kmp_gtid_from_thread(team->t.t_threads[f]),
4841  team->t.t_id, f, th->th.th_new_place,
4842  th->th.th_first_place, th->th.th_last_place, spacing));
4843  }
4844  }
4845  }
4846  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4847  } else {
4848  int S, rem, gap, s_count;
4849  S = n_th / n_places;
4850  s_count = 0;
4851  rem = n_th - (S * n_places);
4852  gap = rem > 0 ? n_places / rem : n_places;
4853  int place = masters_place;
4854  int gap_ct = gap;
4855  thidx = n_th;
4856  if (update_master_only == 1)
4857  thidx = 1;
4858  for (f = 0; f < thidx; f++) {
4859  kmp_info_t *th = team->t.t_threads[f];
4860  KMP_DEBUG_ASSERT(th != NULL);
4861 
4862  th->th.th_first_place = place;
4863  th->th.th_last_place = place;
4864  th->th.th_new_place = place;
4865 #if OMP_50_ENABLED
4866  if (__kmp_display_affinity && place != th->th.th_current_place &&
4867  team->t.t_display_affinity != 1) {
4868  team->t.t_display_affinity = 1;
4869  }
4870 #endif
4871  s_count++;
4872 
4873  if ((s_count == S) && rem && (gap_ct == gap)) {
4874  // do nothing, add an extra thread to place on next iteration
4875  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4876  // we added an extra thread to this place; move on to next place
4877  if (place == last_place) {
4878  place = first_place;
4879  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4880  place = 0;
4881  } else {
4882  place++;
4883  }
4884  s_count = 0;
4885  gap_ct = 1;
4886  rem--;
4887  } else if (s_count == S) { // place is full; don't add extra thread
4888  if (place == last_place) {
4889  place = first_place;
4890  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4891  place = 0;
4892  } else {
4893  place++;
4894  }
4895  gap_ct++;
4896  s_count = 0;
4897  }
4898 
4899  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4900  "partition = [%d,%d]\n",
4901  __kmp_gtid_from_thread(team->t.t_threads[f]),
4902  team->t.t_id, f, th->th.th_new_place,
4903  th->th.th_first_place, th->th.th_last_place));
4904  }
4905  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4906  }
4907  } break;
4908 
4909  default:
4910  break;
4911  }
4912 
4913  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4914 }
4915 
4916 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
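
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
// A minimal, self-contained sketch (not part of the runtime) of the
// wrap-around place advance that the proc_bind_close and proc_bind_spread
// cases of __kmp_partition_places() above repeat inline. The num_masks
// parameter stands in for __kmp_affinity_num_masks; the function name is
// hypothetical and the routine is not called anywhere in the library.
static inline int example_next_place(int place, int first_place,
                                     int last_place, int num_masks) {
  // The partition [first_place, last_place] may wrap around the end of the
  // global place list of size num_masks, so three cases are needed.
  if (place == last_place)
    return first_place; // end of the partition: wrap to its start
  if (place == num_masks - 1)
    return 0; // end of the global place list: wrap to place 0
  return place + 1; // otherwise simply advance to the next place
}
#endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */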
4917 
4918 /* allocate a new team data structure to use. take one off of the free pool if
4919  available */
4920 kmp_team_t *
4921 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4922 #if OMPT_SUPPORT
4923  ompt_data_t ompt_parallel_data,
4924 #endif
4925 #if OMP_40_ENABLED
4926  kmp_proc_bind_t new_proc_bind,
4927 #endif
4928  kmp_internal_control_t *new_icvs,
4929  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4930  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4931  int f;
4932  kmp_team_t *team;
4933  int use_hot_team = !root->r.r_active;
4934  int level = 0;
4935 
4936  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4937  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4938  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4939  KMP_MB();
4940 
4941 #if KMP_NESTED_HOT_TEAMS
4942  kmp_hot_team_ptr_t *hot_teams;
4943  if (master) {
4944  team = master->th.th_team;
4945  level = team->t.t_active_level;
4946  if (master->th.th_teams_microtask) { // in teams construct?
4947  if (master->th.th_teams_size.nteams > 1 &&
4948  ( // #teams > 1
4949  team->t.t_pkfn ==
4950  (microtask_t)__kmp_teams_master || // inner fork of the teams
4951  master->th.th_teams_level <
4952  team->t.t_level)) { // or nested parallel inside the teams
4953  ++level; // level is not incremented if #teams==1 or for the outer fork
4954  // of the teams; it is incremented otherwise
4955  }
4956  }
4957  hot_teams = master->th.th_hot_teams;
4958  if (level < __kmp_hot_teams_max_level && hot_teams &&
4959  hot_teams[level]
4960  .hot_team) { // hot team has already been allocated for given level
4961  use_hot_team = 1;
4962  } else {
4963  use_hot_team = 0;
4964  }
4965  }
4966 #endif
4967  // Optimization to use a "hot" team
4968  if (use_hot_team && new_nproc > 1) {
4969  KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4970 #if KMP_NESTED_HOT_TEAMS
4971  team = hot_teams[level].hot_team;
4972 #else
4973  team = root->r.r_hot_team;
4974 #endif
4975 #if KMP_DEBUG
4976  if (__kmp_tasking_mode != tskm_immediate_exec) {
4977  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4978  "task_team[1] = %p before reinit\n",
4979  team->t.t_task_team[0], team->t.t_task_team[1]));
4980  }
4981 #endif
4982 
4983  // Has the number of threads changed?
4984  /* Let's assume the most common case is that the number of threads is
4985  unchanged, and put that case first. */
4986  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4987  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4988  // This case can mean that omp_set_num_threads() was called and the hot
4989  // team size was already reduced, so we check the special flag
4990  if (team->t.t_size_changed == -1) {
4991  team->t.t_size_changed = 1;
4992  } else {
4993  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4994  }
4995 
4996  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4997  kmp_r_sched_t new_sched = new_icvs->sched;
4998  // set master's schedule as new run-time schedule
4999  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5000 
5001  __kmp_reinitialize_team(team, new_icvs,
5002  root->r.r_uber_thread->th.th_ident);
5003 
5004  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5005  team->t.t_threads[0], team));
5006  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5007 
5008 #if OMP_40_ENABLED
5009 #if KMP_AFFINITY_SUPPORTED
5010  if ((team->t.t_size_changed == 0) &&
5011  (team->t.t_proc_bind == new_proc_bind)) {
5012  if (new_proc_bind == proc_bind_spread) {
5013  __kmp_partition_places(
5014  team, 1); // add flag to update only master for spread
5015  }
5016  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5017  "proc_bind = %d, partition = [%d,%d]\n",
5018  team->t.t_id, new_proc_bind, team->t.t_first_place,
5019  team->t.t_last_place));
5020  } else {
5021  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5022  __kmp_partition_places(team);
5023  }
5024 #else
5025  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5026 #endif /* KMP_AFFINITY_SUPPORTED */
5027 #endif /* OMP_40_ENABLED */
5028  } else if (team->t.t_nproc > new_nproc) {
5029  KA_TRACE(20,
5030  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5031  new_nproc));
5032 
5033  team->t.t_size_changed = 1;
5034 #if KMP_NESTED_HOT_TEAMS
5035  if (__kmp_hot_teams_mode == 0) {
5036  // AC: the saved number of threads should correspond to the team's value in
5037  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5038  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5039  hot_teams[level].hot_team_nth = new_nproc;
5040 #endif // KMP_NESTED_HOT_TEAMS
5041  /* release the extra threads we don't need any more */
5042  for (f = new_nproc; f < team->t.t_nproc; f++) {
5043  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5044  if (__kmp_tasking_mode != tskm_immediate_exec) {
5045  // When decreasing team size, threads no longer in the team should
5046  // unref task team.
5047  team->t.t_threads[f]->th.th_task_team = NULL;
5048  }
5049  __kmp_free_thread(team->t.t_threads[f]);
5050  team->t.t_threads[f] = NULL;
5051  }
5052 #if KMP_NESTED_HOT_TEAMS
5053  } // (__kmp_hot_teams_mode == 0)
5054  else {
5055  // When keeping extra threads in team, switch threads to wait on own
5056  // b_go flag
5057  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5058  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5059  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5060  for (int b = 0; b < bs_last_barrier; ++b) {
5061  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5062  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5063  }
5064  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5065  }
5066  }
5067  }
5068 #endif // KMP_NESTED_HOT_TEAMS
5069  team->t.t_nproc = new_nproc;
5070  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5071  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5072  __kmp_reinitialize_team(team, new_icvs,
5073  root->r.r_uber_thread->th.th_ident);
5074 
5075  /* update the remaining threads */
5076  for (f = 0; f < new_nproc; ++f) {
5077  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5078  }
5079  // restore the current task state of the master thread: should be the
5080  // implicit task
5081  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5082  team->t.t_threads[0], team));
5083 
5084  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5085 
5086 #ifdef KMP_DEBUG
5087  for (f = 0; f < team->t.t_nproc; f++) {
5088  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5089  team->t.t_threads[f]->th.th_team_nproc ==
5090  team->t.t_nproc);
5091  }
5092 #endif
5093 
5094 #if OMP_40_ENABLED
5095  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5096 #if KMP_AFFINITY_SUPPORTED
5097  __kmp_partition_places(team);
5098 #endif
5099 #endif
5100  } else { // team->t.t_nproc < new_nproc
5101 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5102  kmp_affin_mask_t *old_mask;
5103  if (KMP_AFFINITY_CAPABLE()) {
5104  KMP_CPU_ALLOC(old_mask);
5105  }
5106 #endif
5107 
5108  KA_TRACE(20,
5109  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5110  new_nproc));
5111 
5112  team->t.t_size_changed = 1;
5113 
5114 #if KMP_NESTED_HOT_TEAMS
5115  int avail_threads = hot_teams[level].hot_team_nth;
5116  if (new_nproc < avail_threads)
5117  avail_threads = new_nproc;
5118  kmp_info_t **other_threads = team->t.t_threads;
5119  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5120  // Adjust barrier data of reserved threads (if any) of the team
5121  // Other data will be set in __kmp_initialize_info() below.
5122  int b;
5123  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5124  for (b = 0; b < bs_last_barrier; ++b) {
5125  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5126  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5127 #if USE_DEBUGGER
5128  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5129 #endif
5130  }
5131  }
5132  if (hot_teams[level].hot_team_nth >= new_nproc) {
5133  // we have all needed threads in reserve, no need to allocate any
5134  // this is only possible in mode 1; there can be no reserved threads in mode 0
5135  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5136  team->t.t_nproc = new_nproc; // just get reserved threads involved
5137  } else {
5138  // we may have some threads in reserve, but not enough
5139  team->t.t_nproc =
5140  hot_teams[level]
5141  .hot_team_nth; // get reserved threads involved if any
5142  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5143 #endif // KMP_NESTED_HOT_TEAMS
5144  if (team->t.t_max_nproc < new_nproc) {
5145  /* reallocate larger arrays */
5146  __kmp_reallocate_team_arrays(team, new_nproc);
5147  __kmp_reinitialize_team(team, new_icvs, NULL);
5148  }
5149 
5150 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5151  /* Temporarily set full mask for master thread before creation of
5152  workers. The reason is that workers inherit the affinity from the master,
5153  so if a lot of workers are created on a single core quickly, they
5154  don't get a chance to set their own affinity for a long time. */
5155  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5156 #endif
5157 
5158  /* allocate new threads for the hot team */
5159  for (f = team->t.t_nproc; f < new_nproc; f++) {
5160  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5161  KMP_DEBUG_ASSERT(new_worker);
5162  team->t.t_threads[f] = new_worker;
5163 
5164  KA_TRACE(20,
5165  ("__kmp_allocate_team: team %d init T#%d arrived: "
5166  "join=%llu, plain=%llu\n",
5167  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5168  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5169  team->t.t_bar[bs_plain_barrier].b_arrived));
5170 
5171  { // Initialize barrier data for new threads.
5172  int b;
5173  kmp_balign_t *balign = new_worker->th.th_bar;
5174  for (b = 0; b < bs_last_barrier; ++b) {
5175  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5176  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5177  KMP_BARRIER_PARENT_FLAG);
5178 #if USE_DEBUGGER
5179  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5180 #endif
5181  }
5182  }
5183  }
5184 
5185 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5186  if (KMP_AFFINITY_CAPABLE()) {
5187  /* Restore initial master thread's affinity mask */
5188  __kmp_set_system_affinity(old_mask, TRUE);
5189  KMP_CPU_FREE(old_mask);
5190  }
5191 #endif
5192 #if KMP_NESTED_HOT_TEAMS
5193  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5194 #endif // KMP_NESTED_HOT_TEAMS
5195  /* make sure everyone is synchronized */
5196  int old_nproc = team->t.t_nproc; // save old value and use to update only
5197  // new threads below
5198  __kmp_initialize_team(team, new_nproc, new_icvs,
5199  root->r.r_uber_thread->th.th_ident);
5200 
5201  /* reinitialize the threads */
5202  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5203  for (f = 0; f < team->t.t_nproc; ++f)
5204  __kmp_initialize_info(team->t.t_threads[f], team, f,
5205  __kmp_gtid_from_tid(f, team));
5206  if (level) { // set th_task_state for new threads in nested hot team
5207  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5208  // only need to set the th_task_state for the new threads. th_task_state
5209  // for master thread will not be accurate until after this in
5210  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5211  // correct value.
5212  for (f = old_nproc; f < team->t.t_nproc; ++f)
5213  team->t.t_threads[f]->th.th_task_state =
5214  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5215  } else { // set th_task_state for new threads in non-nested hot team
5216  int old_state =
5217  team->t.t_threads[0]->th.th_task_state; // copy master's state
5218  for (f = old_nproc; f < team->t.t_nproc; ++f)
5219  team->t.t_threads[f]->th.th_task_state = old_state;
5220  }
5221 
5222 #ifdef KMP_DEBUG
5223  for (f = 0; f < team->t.t_nproc; ++f) {
5224  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5225  team->t.t_threads[f]->th.th_team_nproc ==
5226  team->t.t_nproc);
5227  }
5228 #endif
5229 
5230 #if OMP_40_ENABLED
5231  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5232 #if KMP_AFFINITY_SUPPORTED
5233  __kmp_partition_places(team);
5234 #endif
5235 #endif
5236  } // Check changes in number of threads
5237 
5238 #if OMP_40_ENABLED
5239  kmp_info_t *master = team->t.t_threads[0];
5240  if (master->th.th_teams_microtask) {
5241  for (f = 1; f < new_nproc; ++f) {
5242  // propagate teams construct specific info to workers
5243  kmp_info_t *thr = team->t.t_threads[f];
5244  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5245  thr->th.th_teams_level = master->th.th_teams_level;
5246  thr->th.th_teams_size = master->th.th_teams_size;
5247  }
5248  }
5249 #endif /* OMP_40_ENABLED */
5250 #if KMP_NESTED_HOT_TEAMS
5251  if (level) {
5252  // Sync barrier state for nested hot teams, not needed for outermost hot
5253  // team.
5254  for (f = 1; f < new_nproc; ++f) {
5255  kmp_info_t *thr = team->t.t_threads[f];
5256  int b;
5257  kmp_balign_t *balign = thr->th.th_bar;
5258  for (b = 0; b < bs_last_barrier; ++b) {
5259  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5260  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5261 #if USE_DEBUGGER
5262  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5263 #endif
5264  }
5265  }
5266  }
5267 #endif // KMP_NESTED_HOT_TEAMS
5268 
5269  /* reallocate space for arguments if necessary */
5270  __kmp_alloc_argv_entries(argc, team, TRUE);
5271  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5272  // The hot team re-uses the previous task team,
5273  // if untouched during the previous release->gather phase.
5274 
5275  KF_TRACE(10, (" hot_team = %p\n", team));
5276 
5277 #if KMP_DEBUG
5278  if (__kmp_tasking_mode != tskm_immediate_exec) {
5279  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5280  "task_team[1] = %p after reinit\n",
5281  team->t.t_task_team[0], team->t.t_task_team[1]));
5282  }
5283 #endif
5284 
5285 #if OMPT_SUPPORT
5286  __ompt_team_assign_id(team, ompt_parallel_data);
5287 #endif
5288 
5289  KMP_MB();
5290 
5291  return team;
5292  }
5293 
5294  /* next, let's try to take one from the team pool */
5295  KMP_MB();
5296  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5297  /* TODO: consider resizing undersized teams instead of reaping them, now
5298  that we have a resizing mechanism */
5299  if (team->t.t_max_nproc >= max_nproc) {
5300  /* take this team from the team pool */
5301  __kmp_team_pool = team->t.t_next_pool;
5302 
5303  /* setup the team for fresh use */
5304  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5305 
5306  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5307  "task_team[1] %p to NULL\n",
5308  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5309  team->t.t_task_team[0] = NULL;
5310  team->t.t_task_team[1] = NULL;
5311 
5312  /* reallocate space for arguments if necessary */
5313  __kmp_alloc_argv_entries(argc, team, TRUE);
5314  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5315 
5316  KA_TRACE(
5317  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5318  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5319  { // Initialize barrier data.
5320  int b;
5321  for (b = 0; b < bs_last_barrier; ++b) {
5322  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5323 #if USE_DEBUGGER
5324  team->t.t_bar[b].b_master_arrived = 0;
5325  team->t.t_bar[b].b_team_arrived = 0;
5326 #endif
5327  }
5328  }
5329 
5330 #if OMP_40_ENABLED
5331  team->t.t_proc_bind = new_proc_bind;
5332 #endif
5333 
5334  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5335  team->t.t_id));
5336 
5337 #if OMPT_SUPPORT
5338  __ompt_team_assign_id(team, ompt_parallel_data);
5339 #endif
5340 
5341  KMP_MB();
5342 
5343  return team;
5344  }
5345 
5346  /* reap team if it is too small, then loop back and check the next one */
5347  // not sure if this is wise, but it will be redone during the hot-teams
5348  // rewrite.
5349  /* TODO: Use technique to find the right size hot-team, don't reap them */
5350  team = __kmp_reap_team(team);
5351  __kmp_team_pool = team;
5352  }
5353 
5354  /* nothing available in the pool, no matter, make a new team! */
5355  KMP_MB();
5356  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5357 
5358  /* and set it up */
5359  team->t.t_max_nproc = max_nproc;
5360  /* NOTE well, for some reason allocating one big buffer and dividing it up
5361  seems to really hurt performance a lot on the P4, so let's not use this */
5362  __kmp_allocate_team_arrays(team, max_nproc);
5363 
5364  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5365  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5366 
5367  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5368  "%p to NULL\n",
5369  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5370  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5371  // memory, no need to duplicate
5372  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5373  // memory, no need to duplicate
5374 
5375  if (__kmp_storage_map) {
5376  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5377  }
5378 
5379  /* allocate space for arguments */
5380  __kmp_alloc_argv_entries(argc, team, FALSE);
5381  team->t.t_argc = argc;
5382 
5383  KA_TRACE(20,
5384  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5385  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5386  { // Initialize barrier data.
5387  int b;
5388  for (b = 0; b < bs_last_barrier; ++b) {
5389  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5390 #if USE_DEBUGGER
5391  team->t.t_bar[b].b_master_arrived = 0;
5392  team->t.t_bar[b].b_team_arrived = 0;
5393 #endif
5394  }
5395  }
5396 
5397 #if OMP_40_ENABLED
5398  team->t.t_proc_bind = new_proc_bind;
5399 #endif
5400 
5401 #if OMPT_SUPPORT
5402  __ompt_team_assign_id(team, ompt_parallel_data);
5403  team->t.ompt_serialized_team_info = NULL;
5404 #endif
5405 
5406  KMP_MB();
5407 
5408  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5409  team->t.t_id));
5410 
5411  return team;
5412 }
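
// A minimal, self-contained sketch (not part of the runtime) of the allocation
// policy that __kmp_allocate_team() above implements: (1) reuse the hot team
// when the root is not active, (2) otherwise take a large-enough team from the
// team pool, reaping undersized pooled teams along the way, and (3) otherwise
// allocate a brand new team. ExampleTeam and example_allocate_team are
// hypothetical stand-ins for kmp_team_t and the real routine; the hot-team
// resizing and ICV/barrier re-initialization steps are deliberately omitted.
struct ExampleTeam {
  int max_nproc; // capacity of the team's thread array
  ExampleTeam *next_pool; // link used while the team sits in the free pool
};

static inline ExampleTeam *example_allocate_team(ExampleTeam *hot_team,
                                                 bool root_active,
                                                 ExampleTeam **team_pool,
                                                 int max_nproc) {
  // 1) Prefer the hot team when the root is not already inside a region.
  if (!root_active && hot_team != NULL)
    return hot_team;
  // 2) Otherwise walk the free pool; like the loop above, undersized teams
  //    are reaped (freed) rather than kept for later.
  while (*team_pool != NULL) {
    ExampleTeam *candidate = *team_pool;
    *team_pool = candidate->next_pool; // unlink the head of the pool
    if (candidate->max_nproc >= max_nproc)
      return candidate; // large enough: reuse it
    delete candidate; // too small: reap it and keep looking
  }
  // 3) Nothing suitable was found: allocate a fresh team.
  ExampleTeam *team = new ExampleTeam;
  team->max_nproc = max_nproc;
  team->next_pool = NULL;
  return team;
}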
5413 
5414 /* TODO implement hot-teams at all levels */
5415 /* TODO implement lazy thread release on demand (disband request) */
5416 
5417 /* free the team. return it to the team pool. release all the threads
5418  * associated with it */
5419 void __kmp_free_team(kmp_root_t *root,
5420  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5421  int f;
5422  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5423  team->t.t_id));
5424 
5425  /* verify state */
5426  KMP_DEBUG_ASSERT(root);
5427  KMP_DEBUG_ASSERT(team);
5428  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5429  KMP_DEBUG_ASSERT(team->t.t_threads);
5430 
5431  int use_hot_team = team == root->r.r_hot_team;
5432 #if KMP_NESTED_HOT_TEAMS
5433  int level;
5434  kmp_hot_team_ptr_t *hot_teams;
5435  if (master) {
5436  level = team->t.t_active_level - 1;
5437  if (master->th.th_teams_microtask) { // in teams construct?
5438  if (master->th.th_teams_size.nteams > 1) {
5439  ++level; // level was not increased in teams construct for
5440  // team_of_masters
5441  }
5442  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5443  master->th.th_teams_level == team->t.t_level) {
5444  ++level; // level was not increased in teams construct for
5445  // team_of_workers before the parallel
5446  } // team->t.t_level will be increased inside parallel
5447  }
5448  hot_teams = master->th.th_hot_teams;
5449  if (level < __kmp_hot_teams_max_level) {
5450  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5451  use_hot_team = 1;
5452  }
5453  }
5454 #endif // KMP_NESTED_HOT_TEAMS
5455 
5456  /* team is done working */
5457  TCW_SYNC_PTR(team->t.t_pkfn,
5458  NULL); // Important for Debugging Support Library.
5459 #if KMP_OS_WINDOWS
5460  team->t.t_copyin_counter = 0; // init counter for possible reuse
5461 #endif
5462  // Do not reset pointer to parent team to NULL for hot teams.
5463 
5464  /* if we are non-hot team, release our threads */
5465  if (!use_hot_team) {
5466  if (__kmp_tasking_mode != tskm_immediate_exec) {
5467  // Wait for threads to reach reapable state
5468  for (f = 1; f < team->t.t_nproc; ++f) {
5469  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5470  kmp_info_t *th = team->t.t_threads[f];
5471  volatile kmp_uint32 *state = &th->th.th_reap_state;
5472  while (*state != KMP_SAFE_TO_REAP) {
5473 #if KMP_OS_WINDOWS
5474  // On Windows a thread can be killed at any time, check this
5475  DWORD ecode;
5476  if (!__kmp_is_thread_alive(th, &ecode)) {
5477  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5478  break;
5479  }
5480 #endif
5481  // first check if thread is sleeping
5482  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5483  if (fl.is_sleeping())
5484  fl.resume(__kmp_gtid_from_thread(th));
5485  KMP_CPU_PAUSE();
5486  }
5487  }
5488 
5489  // Delete task teams
5490  int tt_idx;
5491  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5492  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5493  if (task_team != NULL) {
5494  for (f = 0; f < team->t.t_nproc;
5495  ++f) { // Have all threads unref task teams
5496  team->t.t_threads[f]->th.th_task_team = NULL;
5497  }
5498  KA_TRACE(
5499  20,
5500  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5501  __kmp_get_gtid(), task_team, team->t.t_id));
5502 #if KMP_NESTED_HOT_TEAMS
5503  __kmp_free_task_team(master, task_team);
5504 #endif
5505  team->t.t_task_team[tt_idx] = NULL;
5506  }
5507  }
5508  }
5509 
5510  // Reset pointer to parent team only for non-hot teams.
5511  team->t.t_parent = NULL;
5512  team->t.t_level = 0;
5513  team->t.t_active_level = 0;
5514 
5515  /* free the worker threads */
5516  for (f = 1; f < team->t.t_nproc; ++f) {
5517  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5518  __kmp_free_thread(team->t.t_threads[f]);
5519  team->t.t_threads[f] = NULL;
5520  }
5521 
5522  /* put the team back in the team pool */
5523  /* TODO limit size of team pool, call reap_team if pool too large */
5524  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5525  __kmp_team_pool = (volatile kmp_team_t *)team;
5526  }
5527 
5528  KMP_MB();
5529 }
5530 
5531 /* reap the team. destroy it, reclaim all its resources and free its memory */
5532 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5533  kmp_team_t *next_pool = team->t.t_next_pool;
5534 
5535  KMP_DEBUG_ASSERT(team);
5536  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5537  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5538  KMP_DEBUG_ASSERT(team->t.t_threads);
5539  KMP_DEBUG_ASSERT(team->t.t_argv);
5540 
5541  /* TODO clean the threads that are a part of this? */
5542 
5543  /* free stuff */
5544  __kmp_free_team_arrays(team);
5545  if (team->t.t_argv != &team->t.t_inline_argv[0])
5546  __kmp_free((void *)team->t.t_argv);
5547  __kmp_free(team);
5548 
5549  KMP_MB();
5550  return next_pool;
5551 }
5552 
5553 // Free the thread. Don't reap it, just place it on the pool of available
5554 // threads.
5555 //
5556 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5557 // binding for the affinity mechanism to be useful.
5558 //
5559 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5560 // However, we want to avoid a potential performance problem by always
5561 // scanning through the list to find the correct point at which to insert
5562 // the thread (potential N**2 behavior). To do this we keep track of the
5563 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5564 // With single-level parallelism, threads will always be added to the tail
5565 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5566 // parallelism, all bets are off and we may need to scan through the entire
5567 // free list.
5568 //
5569 // This change also has a potentially large performance benefit, for some
5570 // applications. Previously, as threads were freed from the hot team, they
5571 // would be placed back on the free list in inverse order. If the hot team
5572 // grew back to its original size, then the freed threads would be placed
5573 // back on the hot team in reverse order. This could cause bad cache
5574 // locality problems on programs where the size of the hot team regularly
5575 // grew and shrank.
5576 //
5577 // Now, for single-level parallelism, the OMP tid is always == gtid.
5578 void __kmp_free_thread(kmp_info_t *this_th) {
5579  int gtid;
5580  kmp_info_t **scan;
5581  kmp_root_t *root = this_th->th.th_root;
5582 
5583  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5584  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5585 
5586  KMP_DEBUG_ASSERT(this_th);
5587 
5588  // When moving a thread to the pool, switch it to wait on its own b_go flag
5589  // and reset its barrier team pointers to NULL (uninitialized).
5590  int b;
5591  kmp_balign_t *balign = this_th->th.th_bar;
5592  for (b = 0; b < bs_last_barrier; ++b) {
5593  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5594  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5595  balign[b].bb.team = NULL;
5596  balign[b].bb.leaf_kids = 0;
5597  }
5598  this_th->th.th_task_state = 0;
5599  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5600 
5601  /* put thread back on the free pool */
5602  TCW_PTR(this_th->th.th_team, NULL);
5603  TCW_PTR(this_th->th.th_root, NULL);
5604  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5605 
5606  /* If the implicit task assigned to this thread can be used by other threads,
5607  * multiple threads can share the data and try to free the task at
5608  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5609  * likely when the hot team is disabled, but can occur even when
5610  * the hot team is enabled */
5611  __kmp_free_implicit_task(this_th);
5612  this_th->th.th_current_task = NULL;
5613 
5614  // If the __kmp_thread_pool_insert_pt is already past the new insert
5615  // point, then we need to re-scan the entire list.
5616  gtid = this_th->th.th_info.ds.ds_gtid;
5617  if (__kmp_thread_pool_insert_pt != NULL) {
5618  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5619  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5620  __kmp_thread_pool_insert_pt = NULL;
5621  }
5622  }
5623 
5624  // Scan down the list to find the place to insert the thread.
5625  // scan is the address of a link in the list, possibly the address of
5626  // __kmp_thread_pool itself.
5627  //
5628  // In the absence of nested parallelism, the for loop will have 0 iterations.
5629  if (__kmp_thread_pool_insert_pt != NULL) {
5630  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5631  } else {
5632  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5633  }
5634  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5635  scan = &((*scan)->th.th_next_pool))
5636  ;
5637 
5638  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5639  // to its address.
5640  TCW_PTR(this_th->th.th_next_pool, *scan);
5641  __kmp_thread_pool_insert_pt = *scan = this_th;
5642  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5643  (this_th->th.th_info.ds.ds_gtid <
5644  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5645  TCW_4(this_th->th.th_in_pool, TRUE);
5646  __kmp_thread_pool_nth++;
5647 
5648  TCW_4(__kmp_nth, __kmp_nth - 1);
5649  root->r.r_cg_nthreads--;
5650 
5651 #ifdef KMP_ADJUST_BLOCKTIME
5652  /* Adjust blocktime back to user setting or default if necessary */
5653  /* Middle initialization might never have occurred */
5654  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5655  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5656  if (__kmp_nth <= __kmp_avail_proc) {
5657  __kmp_zero_bt = FALSE;
5658  }
5659  }
5660 #endif /* KMP_ADJUST_BLOCKTIME */
5661 
5662  KMP_MB();
5663 }
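
// A minimal, self-contained sketch (not part of the runtime) of the
// sorted-pool insertion described in the comment above __kmp_free_thread().
// ExampleThread, pool_head and insert_pt are hypothetical stand-ins for
// kmp_info_t, __kmp_thread_pool and __kmp_thread_pool_insert_pt; the routine
// is illustrative only and is not called anywhere in the library.
struct ExampleThread {
  int gtid; // global thread id; the pool is kept sorted by this key
  ExampleThread *next_pool;
};

static inline void example_pool_insert(ExampleThread **pool_head,
                                       ExampleThread **insert_pt,
                                       ExampleThread *th) {
  ExampleThread **scan;
  // Resume scanning from the remembered insert point unless it is already
  // past the new thread's gtid; in that case re-scan from the head.
  if (*insert_pt != NULL && (*insert_pt)->gtid <= th->gtid)
    scan = &(*insert_pt)->next_pool;
  else
    scan = pool_head;
  // With single-level parallelism this loop performs zero iterations because
  // threads are always appended at the remembered tail of the sorted list.
  while (*scan != NULL && (*scan)->gtid < th->gtid)
    scan = &(*scan)->next_pool;
  // Splice the thread in and remember it as the new insert point.
  th->next_pool = *scan;
  *scan = th;
  *insert_pt = th;
}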
5664 
5665 /* ------------------------------------------------------------------------ */
5666 
5667 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5668  int gtid = this_thr->th.th_info.ds.ds_gtid;
5669  /* void *stack_data;*/
5670  kmp_team_t *(*volatile pteam);
5671 
5672  KMP_MB();
5673  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5674 
5675  if (__kmp_env_consistency_check) {
5676  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5677  }
5678 
5679 #if OMPT_SUPPORT
5680  ompt_data_t *thread_data;
5681  if (ompt_enabled.enabled) {
5682  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5683  *thread_data = ompt_data_none;
5684 
5685  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5686  this_thr->th.ompt_thread_info.wait_id = 0;
5687  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5688  if (ompt_enabled.ompt_callback_thread_begin) {
5689  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5690  ompt_thread_worker, thread_data);
5691  }
5692  }
5693 #endif
5694 
5695 #if OMPT_SUPPORT
5696  if (ompt_enabled.enabled) {
5697  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5698  }
5699 #endif
5700  /* This is the place where threads wait for work */
5701  while (!TCR_4(__kmp_global.g.g_done)) {
5702  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5703  KMP_MB();
5704 
5705  /* wait for work to do */
5706  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5707 
5708  /* No tid yet since not part of a team */
5709  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5710 
5711 #if OMPT_SUPPORT
5712  if (ompt_enabled.enabled) {
5713  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5714  }
5715 #endif
5716 
5717  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5718 
5719  /* have we been allocated? */
5720  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5721  /* we were just woken up, so run our new task */
5722  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5723  int rc;
5724  KA_TRACE(20,
5725  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5726  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5727  (*pteam)->t.t_pkfn));
5728 
5729  updateHWFPControl(*pteam);
5730 
5731 #if OMPT_SUPPORT
5732  if (ompt_enabled.enabled) {
5733  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5734  }
5735 #endif
5736 
5737  rc = (*pteam)->t.t_invoke(gtid);
5738  KMP_ASSERT(rc);
5739 
5740  KMP_MB();
5741  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5742  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5743  (*pteam)->t.t_pkfn));
5744  }
5745 #if OMPT_SUPPORT
5746  if (ompt_enabled.enabled) {
5747  /* no frame set while outside task */
5748  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5749 
5750  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5751  }
5752 #endif
5753  /* join barrier after parallel region */
5754  __kmp_join_barrier(gtid);
5755  }
5756  }
5757  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5758 
5759 #if OMPT_SUPPORT
5760  if (ompt_enabled.ompt_callback_thread_end) {
5761  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5762  }
5763 #endif
5764 
5765  this_thr->th.th_task_team = NULL;
5766  /* run the destructors for the threadprivate data for this thread */
5767  __kmp_common_destroy_gtid(gtid);
5768 
5769  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5770  KMP_MB();
5771  return this_thr;
5772 }
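
// A minimal sketch (not part of the runtime) of the worker lifecycle that
// __kmp_launch_thread() above implements: sleep at the fork barrier until a
// team is assigned, run the team's microtask, synchronize at the join barrier,
// and repeat until the library signals shutdown. ExampleWorkerOps and all of
// its callbacks are hypothetical stand-ins used only for illustration.
struct ExampleWorkerOps {
  bool (*shutdown_requested)(void); // e.g. a flag like __kmp_global.g.g_done
  void (*wait_at_fork_barrier)(void); // block until the master releases us
  bool (*have_team)(void); // true once the thread has been placed in a team
  void (*invoke_microtask)(void); // run the body of the parallel region
  void (*wait_at_join_barrier)(void); // synchronize the team after the region
};

static inline void example_worker_loop(const ExampleWorkerOps *ops) {
  while (!ops->shutdown_requested()) {
    ops->wait_at_fork_barrier(); // released by the master at a fork point
    if (ops->have_team() && !ops->shutdown_requested()) {
      ops->invoke_microtask();
      ops->wait_at_join_barrier();
    }
  }
}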
5773 
5774 /* ------------------------------------------------------------------------ */
5775 
5776 void __kmp_internal_end_dest(void *specific_gtid) {
5777 #if KMP_COMPILER_ICC
5778 #pragma warning(push)
5779 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5780 // significant bits
5781 #endif
5782  // Make sure no significant bits are lost
5783  int gtid = (kmp_intptr_t)specific_gtid - 1;
5784 #if KMP_COMPILER_ICC
5785 #pragma warning(pop)
5786 #endif
5787 
5788  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5789  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5790  * this is because 0 is reserved for the nothing-stored case */
5791 
5792  /* josh: One reason for setting the gtid specific data even when it is being
5793  destroyed by pthread is to allow gtid lookup through thread specific data
5794  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5795  that gets executed in the call to __kmp_internal_end_thread, actually
5796  gets the gtid through the thread specific data. Setting it here seems
5797  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5798  to run smoothly.
5799  todo: get rid of this after we remove the dependence on
5800  __kmp_gtid_get_specific */
5801  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5802  __kmp_gtid_set_specific(gtid);
5803 #ifdef KMP_TDATA_GTID
5804  __kmp_gtid = gtid;
5805 #endif
5806  __kmp_internal_end_thread(gtid);
5807 }
5808 
5809 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5810 
5811 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5812 // cases destructors work perfectly, but in real libomp.so I have no evidence it
5813 // is ever called. However, the -fini linker option in makefile.mk works fine.
5814 
5815 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5816  __kmp_internal_end_atexit();
5817 }
5818 
5819 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5820 
5821 #endif
5822 
5823 /* [Windows] josh: when the atexit handler is called, there may still be more
5824  than one thread alive */
5825 void __kmp_internal_end_atexit(void) {
5826  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5827  /* [Windows]
5828  josh: ideally, we want to completely shut down the library in this atexit
5829  handler, but stat code that depends on thread specific data for gtid fails
5830  because that data becomes unavailable at some point during the shutdown, so
5831  we call __kmp_internal_end_thread instead. We should eventually remove the
5832  dependency on __kmp_get_specific_gtid in the stat code and use
5833  __kmp_internal_end_library to cleanly shut down the library.
5834 
5835  // TODO: Can some of this comment about GVS be removed?
5836  I suspect that the offending stat code is executed when the calling thread
5837  tries to clean up a dead root thread's data structures, resulting in GVS
5838  code trying to close the GVS structures for that thread, but since the stat
5839  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5840  the calling thread is cleaning up itself instead of another thread, it get
5841  confused. This happens because allowing a thread to unregister and cleanup
5842  another thread is a recent modification for addressing an issue.
5843  Based on the current design (20050722), a thread may end up
5844  trying to unregister another thread only if thread death does not trigger
5845  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5846  thread specific data destructor function to detect thread death. For
5847  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5848  is nothing. Thus, the workaround is applicable only for Windows static
5849  stat library. */
5850  __kmp_internal_end_library(-1);
5851 #if KMP_OS_WINDOWS
5852  __kmp_close_console();
5853 #endif
5854 }
5855 
5856 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5857  // It is assumed __kmp_forkjoin_lock is acquired.
5858 
5859  int gtid;
5860 
5861  KMP_DEBUG_ASSERT(thread != NULL);
5862 
5863  gtid = thread->th.th_info.ds.ds_gtid;
5864 
5865  if (!is_root) {
5866  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5867  /* Assume the threads are at the fork barrier here */
5868  KA_TRACE(
5869  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5870  gtid));
5871  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5872  * (GEH) */
5873  ANNOTATE_HAPPENS_BEFORE(thread);
5874  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5875  __kmp_release_64(&flag);
5876  }
5877 
5878  // Terminate OS thread.
5879  __kmp_reap_worker(thread);
5880 
5881  // The thread was killed asynchronously. If it was actively
5882  // spinning in the thread pool, decrement the global count.
5883  //
5884  // There is a small timing hole here - if the worker thread was just waking
5885  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5886  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5887  // the global counter might not get updated.
5888  //
5889  // Currently, this can only happen as the library is unloaded,
5890  // so there are no harmful side effects.
5891  if (thread->th.th_active_in_pool) {
5892  thread->th.th_active_in_pool = FALSE;
5893  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5894  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5895  }
5896 
5897  // Decrement # of [worker] threads in the pool.
5898  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5899  --__kmp_thread_pool_nth;
5900  }
5901 
5902  __kmp_free_implicit_task(thread);
5903 
5904 // Free the fast memory for tasking
5905 #if USE_FAST_MEMORY
5906  __kmp_free_fast_memory(thread);
5907 #endif /* USE_FAST_MEMORY */
5908 
5909  __kmp_suspend_uninitialize_thread(thread);
5910 
5911  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5912  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5913 
5914  --__kmp_all_nth;
5915 // __kmp_nth was decremented when thread is added to the pool.
5916 
5917 #ifdef KMP_ADJUST_BLOCKTIME
5918  /* Adjust blocktime back to user setting or default if necessary */
5919  /* Middle initialization might never have occurred */
5920  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5921  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5922  if (__kmp_nth <= __kmp_avail_proc) {
5923  __kmp_zero_bt = FALSE;
5924  }
5925  }
5926 #endif /* KMP_ADJUST_BLOCKTIME */
5927 
5928  /* free the memory being used */
5929  if (__kmp_env_consistency_check) {
5930  if (thread->th.th_cons) {
5931  __kmp_free_cons_stack(thread->th.th_cons);
5932  thread->th.th_cons = NULL;
5933  }
5934  }
5935 
5936  if (thread->th.th_pri_common != NULL) {
5937  __kmp_free(thread->th.th_pri_common);
5938  thread->th.th_pri_common = NULL;
5939  }
5940 
5941  if (thread->th.th_task_state_memo_stack != NULL) {
5942  __kmp_free(thread->th.th_task_state_memo_stack);
5943  thread->th.th_task_state_memo_stack = NULL;
5944  }
5945 
5946 #if KMP_USE_BGET
5947  if (thread->th.th_local.bget_data != NULL) {
5948  __kmp_finalize_bget(thread);
5949  }
5950 #endif
5951 
5952 #if KMP_AFFINITY_SUPPORTED
5953  if (thread->th.th_affin_mask != NULL) {
5954  KMP_CPU_FREE(thread->th.th_affin_mask);
5955  thread->th.th_affin_mask = NULL;
5956  }
5957 #endif /* KMP_AFFINITY_SUPPORTED */
5958 
5959 #if KMP_USE_HIER_SCHED
5960  if (thread->th.th_hier_bar_data != NULL) {
5961  __kmp_free(thread->th.th_hier_bar_data);
5962  thread->th.th_hier_bar_data = NULL;
5963  }
5964 #endif
5965 
5966  __kmp_reap_team(thread->th.th_serial_team);
5967  thread->th.th_serial_team = NULL;
5968  __kmp_free(thread);
5969 
5970  KMP_MB();
5971 
5972 } // __kmp_reap_thread
5973 
5974 static void __kmp_internal_end(void) {
5975  int i;
5976 
5977  /* First, unregister the library */
5978  __kmp_unregister_library();
5979 
5980 #if KMP_OS_WINDOWS
5981  /* In Win static library, we can't tell when a root actually dies, so we
5982  reclaim the data structures for any root threads that have died but not
5983  unregistered themselves, in order to shut down cleanly.
5984  In Win dynamic library we also can't tell when a thread dies. */
5985  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5986 // dead roots
5987 #endif
5988 
5989  for (i = 0; i < __kmp_threads_capacity; i++)
5990  if (__kmp_root[i])
5991  if (__kmp_root[i]->r.r_active)
5992  break;
5993  KMP_MB(); /* Flush all pending memory write invalidates. */
5994  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5995 
5996  if (i < __kmp_threads_capacity) {
5997 #if KMP_USE_MONITOR
5998  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5999  KMP_MB(); /* Flush all pending memory write invalidates. */
6000 
6001  // Need to check that monitor was initialized before reaping it. If we are
6002  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6003  // __kmp_monitor will appear to contain valid data, but it is only valid in
6004  // the parent process, not the child.
6005  // New behavior (201008): instead of keying off of the flag
6006  // __kmp_init_parallel, the monitor thread creation is keyed off
6007  // of the new flag __kmp_init_monitor.
6008  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6009  if (TCR_4(__kmp_init_monitor)) {
6010  __kmp_reap_monitor(&__kmp_monitor);
6011  TCW_4(__kmp_init_monitor, 0);
6012  }
6013  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6014  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6015 #endif // KMP_USE_MONITOR
6016  } else {
6017 /* TODO move this to cleanup code */
6018 #ifdef KMP_DEBUG
6019  /* make sure that everything has properly ended */
6020  for (i = 0; i < __kmp_threads_capacity; i++) {
6021  if (__kmp_root[i]) {
6022  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6023  // there can be uber threads alive here
6024  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6025  }
6026  }
6027 #endif
6028 
6029  KMP_MB();
6030 
6031  // Reap the worker threads.
6032  // This is valid for now, but be careful if threads are reaped sooner.
6033  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6034  // Get the next thread from the pool.
6035  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6036  __kmp_thread_pool = thread->th.th_next_pool;
6037  // Reap it.
6038  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6039  thread->th.th_next_pool = NULL;
6040  thread->th.th_in_pool = FALSE;
6041  __kmp_reap_thread(thread, 0);
6042  }
6043  __kmp_thread_pool_insert_pt = NULL;
6044 
6045  // Reap teams.
6046  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6047  // Get the next team from the pool.
6048  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6049  __kmp_team_pool = team->t.t_next_pool;
6050  // Reap it.
6051  team->t.t_next_pool = NULL;
6052  __kmp_reap_team(team);
6053  }
6054 
6055  __kmp_reap_task_teams();
6056 
6057 #if KMP_OS_UNIX
6058  // Threads that are not reaped should not access any resources since they
6059  // are going to be deallocated soon, so the shutdown sequence should wait
6060  // until all threads either exit the final spin-waiting loop or begin
6061  // sleeping after the given blocktime.
6062  for (i = 0; i < __kmp_threads_capacity; i++) {
6063  kmp_info_t *thr = __kmp_threads[i];
6064  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6065  KMP_CPU_PAUSE();
6066  }
6067 #endif
6068 
6069  for (i = 0; i < __kmp_threads_capacity; ++i) {
6070  // TBD: Add some checking...
6071  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6072  }
6073 
6074  /* Make sure all threadprivate destructors get run by joining with all
6075  worker threads before resetting this flag */
6076  TCW_SYNC_4(__kmp_init_common, FALSE);
6077 
6078  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6079  KMP_MB();
6080 
6081 #if KMP_USE_MONITOR
6082  // See note above: One of the possible fixes for CQ138434 / CQ140126
6083  //
6084  // FIXME: push both code fragments down and CSE them?
6085  // push them into __kmp_cleanup() ?
6086  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6087  if (TCR_4(__kmp_init_monitor)) {
6088  __kmp_reap_monitor(&__kmp_monitor);
6089  TCW_4(__kmp_init_monitor, 0);
6090  }
6091  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6092  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6093 #endif
6094  } /* else !__kmp_global.t_active */
6095  TCW_4(__kmp_init_gtid, FALSE);
6096  KMP_MB(); /* Flush all pending memory write invalidates. */
6097 
6098  __kmp_cleanup();
6099 #if OMPT_SUPPORT
6100  ompt_fini();
6101 #endif
6102 }
6103 
6104 void __kmp_internal_end_library(int gtid_req) {
6105  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6106  /* this shouldn't be a race condition because __kmp_internal_end() is the
6107  only place to clear __kmp_serial_init */
6108  /* we'll check this later too, after we get the lock */
6109  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6110  // redundant, because the next check will work in any case.
6111  if (__kmp_global.g.g_abort) {
6112  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6113  /* TODO abort? */
6114  return;
6115  }
6116  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6117  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6118  return;
6119  }
6120 
6121  KMP_MB(); /* Flush all pending memory write invalidates. */
6122 
6123  /* find out who we are and what we should do */
6124  {
6125  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6126  KA_TRACE(
6127  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6128  if (gtid == KMP_GTID_SHUTDOWN) {
6129  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6130  "already shutdown\n"));
6131  return;
6132  } else if (gtid == KMP_GTID_MONITOR) {
6133  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6134  "registered, or system shutdown\n"));
6135  return;
6136  } else if (gtid == KMP_GTID_DNE) {
6137  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6138  "shutdown\n"));
6139  /* we don't know who we are, but we may still shutdown the library */
6140  } else if (KMP_UBER_GTID(gtid)) {
6141  /* unregister ourselves as an uber thread. gtid is no longer valid */
6142  if (__kmp_root[gtid]->r.r_active) {
6143  __kmp_global.g.g_abort = -1;
6144  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6145  KA_TRACE(10,
6146  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6147  gtid));
6148  return;
6149  } else {
6150  KA_TRACE(
6151  10,
6152  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6153  __kmp_unregister_root_current_thread(gtid);
6154  }
6155  } else {
6156 /* worker threads may call this function through the atexit handler, if they
6157  * call exit() */
6158 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6159  TODO: do a thorough shutdown instead */
6160 #ifdef DUMP_DEBUG_ON_EXIT
6161  if (__kmp_debug_buf)
6162  __kmp_dump_debug_buffer();
6163 #endif
6164  return;
6165  }
6166  }
6167  /* synchronize the termination process */
6168  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6169 
6170  /* have we already finished */
6171  if (__kmp_global.g.g_abort) {
6172  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6173  /* TODO abort? */
6174  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6175  return;
6176  }
6177  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6178  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6179  return;
6180  }
6181 
6182  /* We need this lock to enforce mutex between this reading of
6183  __kmp_threads_capacity and the writing by __kmp_register_root.
6184  Alternatively, we can use a counter of roots that is atomically updated by
6185  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6186  __kmp_internal_end_*. */
6187  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6188 
6189  /* now we can safely conduct the actual termination */
6190  __kmp_internal_end();
6191 
6192  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6193  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6194 
6195  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6196 
6197 #ifdef DUMP_DEBUG_ON_EXIT
6198  if (__kmp_debug_buf)
6199  __kmp_dump_debug_buffer();
6200 #endif
6201 
6202 #if KMP_OS_WINDOWS
6203  __kmp_close_console();
6204 #endif
6205 
6206  __kmp_fini_allocator();
6207 
6208 } // __kmp_internal_end_library
6209 
6210 void __kmp_internal_end_thread(int gtid_req) {
6211  int i;
6212 
6213  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6214  /* this shouldn't be a race condition because __kmp_internal_end() is the
6215  * only place to clear __kmp_serial_init */
6216  /* we'll check this later too, after we get the lock */
6217  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6218  // redundant, because the next check will work in any case.
6219  if (__kmp_global.g.g_abort) {
6220  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6221  /* TODO abort? */
6222  return;
6223  }
6224  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6225  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6226  return;
6227  }
6228 
6229  KMP_MB(); /* Flush all pending memory write invalidates. */
6230 
6231  /* find out who we are and what we should do */
6232  {
6233  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6234  KA_TRACE(10,
6235  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6236  if (gtid == KMP_GTID_SHUTDOWN) {
6237  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6238  "already shutdown\n"));
6239  return;
6240  } else if (gtid == KMP_GTID_MONITOR) {
6241  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6242  "registered, or system shutdown\n"));
6243  return;
6244  } else if (gtid == KMP_GTID_DNE) {
6245  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6246  "shutdown\n"));
6247  return;
6248  /* we don't know who we are */
6249  } else if (KMP_UBER_GTID(gtid)) {
6250  /* unregister ourselves as an uber thread. gtid is no longer valid */
6251  if (__kmp_root[gtid]->r.r_active) {
6252  __kmp_global.g.g_abort = -1;
6253  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6254  KA_TRACE(10,
6255  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6256  gtid));
6257  return;
6258  } else {
6259  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6260  gtid));
6261  __kmp_unregister_root_current_thread(gtid);
6262  }
6263  } else {
6264  /* just a worker thread, let's leave */
6265  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6266 
6267  if (gtid >= 0) {
6268  __kmp_threads[gtid]->th.th_task_team = NULL;
6269  }
6270 
6271  KA_TRACE(10,
6272  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6273  gtid));
6274  return;
6275  }
6276  }
6277 #if KMP_DYNAMIC_LIB
6278  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6279  // uber thread, because it is better to shut down later in the library
6280  // destructor. The reason for this change is a performance problem when a
6281  // non-OpenMP thread in a loop forks and joins many OpenMP threads. We can
6282  // save a lot of time by keeping worker threads alive until program shutdown.
6283  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6284  // and Windows(DPD200287443) that occurs when using critical sections from
6285  // foreign threads.
6286  if (__kmp_pause_status != kmp_hard_paused) {
6287  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6288  return;
6289  }
6290 #endif
6291  /* synchronize the termination process */
6292  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6293 
6294  /* have we already finished */
6295  if (__kmp_global.g.g_abort) {
6296  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6297  /* TODO abort? */
6298  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6299  return;
6300  }
6301  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6302  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6303  return;
6304  }
6305 
6306  /* We need this lock to enforce mutex between this reading of
6307  __kmp_threads_capacity and the writing by __kmp_register_root.
6308  Alternatively, we can use a counter of roots that is atomically updated by
6309  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6310  __kmp_internal_end_*. */
6311 
6312  /* should we finish the run-time? are all siblings done? */
6313  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6314 
6315  for (i = 0; i < __kmp_threads_capacity; ++i) {
6316  if (KMP_UBER_GTID(i)) {
6317  KA_TRACE(
6318  10,
6319  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6320  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6321  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6322  return;
6323  }
6324  }
6325 
6326  /* now we can safely conduct the actual termination */
6327 
6328  __kmp_internal_end();
6329 
6330  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6331  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6332 
6333  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6334 
6335 #ifdef DUMP_DEBUG_ON_EXIT
6336  if (__kmp_debug_buf)
6337  __kmp_dump_debug_buffer();
6338 #endif
6339 } // __kmp_internal_end_thread
6340 
6341 // -----------------------------------------------------------------------------
6342 // Library registration stuff.
6343 
6344 static long __kmp_registration_flag = 0;
6345 // Random value used to indicate library initialization.
6346 static char *__kmp_registration_str = NULL;
6347 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6348 
6349 static inline char *__kmp_reg_status_name() {
6350  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6351  each thread. If registration and unregistration go in different threads
6352  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6353  env var cannot be found, because the name will contain a different pid. */
6354  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6355 } // __kmp_reg_status_get
6356 } // __kmp_reg_status_name
6357 void __kmp_register_library_startup(void) {
6358 
6359  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6360  int done = 0;
6361  union {
6362  double dtime;
6363  long ltime;
6364  } time;
6365 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6366  __kmp_initialize_system_tick();
6367 #endif
6368  __kmp_read_system_time(&time.dtime);
6369  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6370  __kmp_registration_str =
6371  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6372  __kmp_registration_flag, KMP_LIBRARY_FILE);
6373 
6374  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6375  __kmp_registration_str));
6376 
6377  while (!done) {
6378 
6379  char *value = NULL; // Actual value of the environment variable.
6380 
6381  // Set the environment variable, but do not overwrite it if it already exists.
6382  __kmp_env_set(name, __kmp_registration_str, 0);
6383  // Check that the variable was actually written.
6384  value = __kmp_env_get(name);
6385  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6386 
6387  done = 1; // Ok, environment variable set successfully, exit the loop.
6388 
6389  } else {
6390 
6391  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6392  // Check whether it is alive or dead.
6393  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6394  char *tail = value;
6395  char *flag_addr_str = NULL;
6396  char *flag_val_str = NULL;
6397  char const *file_name = NULL;
6398  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6399  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6400  file_name = tail;
6401  if (tail != NULL) {
6402  long *flag_addr = 0;
6403  long flag_val = 0;
6404  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6405  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6406  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6407  // First, check whether environment-encoded address is mapped into
6408  // addr space.
6409  // If so, dereference it to see if it still has the right value.
6410  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6411  neighbor = 1;
6412  } else {
6413  // If not, then we know the other copy of the library is no longer
6414  // running.
6415  neighbor = 2;
6416  }
6417  }
6418  }
6419  switch (neighbor) {
6420  case 0: // Cannot parse environment variable -- neighbor status unknown.
6421  // Assume it is an incompatible format of a future version of the
6422  // library. Assume the other library is alive.
6423  // WARN( ... ); // TODO: Issue a warning.
6424  file_name = "unknown library";
6425  // Attention! Falling to the next case. That's intentional.
6426  // Attention! Falling through to the next case. That's intentional.
6427  // Check it is allowed.
6428  // Check whether this is allowed.
6429  if (!__kmp_str_match_true(duplicate_ok)) {
6430  // That's not allowed. Issue fatal error.
6431  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6432  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6433  }
6434  KMP_INTERNAL_FREE(duplicate_ok);
6435  __kmp_duplicate_library_ok = 1;
6436  done = 1; // Exit the loop.
6437  } break;
6438  case 2: { // Neighbor is dead.
6439  // Clear the variable and try to register library again.
6440  __kmp_env_unset(name);
6441  } break;
6442  default: { KMP_DEBUG_ASSERT(0); } break;
6443  }
6444  }
6445  KMP_INTERNAL_FREE((void *)value);
6446  }
6447  KMP_INTERNAL_FREE((void *)name);
6448 
6449 } // func __kmp_register_library_startup
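
// Illustration (values made up, not produced by any particular run): the
// handshake above leaves an environment entry roughly of the form
//
//   __KMP_REGISTERED_LIB_12345=0x7f3a5c0010c0-cafe1234-libomp.so
//
// i.e. "<flag address>-<flag value>-<library file>". A second copy of the
// runtime started in the same environment parses this value back with
// __kmp_str_split(), checks that the encoded address is still mapped and still
// holds the encoded flag value, and only then treats the neighbor as alive
// (a dead neighbor's variable is cleared and the registration is retried).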
6450 
6451 void __kmp_unregister_library(void) {
6452 
6453  char *name = __kmp_reg_status_name();
6454  char *value = __kmp_env_get(name);
6455 
6456  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6457  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6458  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6459  // Ok, this is our variable. Delete it.
6460  __kmp_env_unset(name);
6461  }
6462 
6463  KMP_INTERNAL_FREE(__kmp_registration_str);
6464  KMP_INTERNAL_FREE(value);
6465  KMP_INTERNAL_FREE(name);
6466 
6467  __kmp_registration_flag = 0;
6468  __kmp_registration_str = NULL;
6469 
6470 } // __kmp_unregister_library
6471 
6472 // End of Library registration stuff.
6473 // -----------------------------------------------------------------------------
6474 
6475 #if KMP_MIC_SUPPORTED
6476 
6477 static void __kmp_check_mic_type() {
6478  kmp_cpuid_t cpuid_state = {0};
6479  kmp_cpuid_t *cs_p = &cpuid_state;
6480  __kmp_x86_cpuid(1, 0, cs_p);
6481  // We don't support mic1 at the moment
6482  if ((cs_p->eax & 0xff0) == 0xB10) {
6483  __kmp_mic_type = mic2;
6484  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6485  __kmp_mic_type = mic3;
6486  } else {
6487  __kmp_mic_type = non_mic;
6488  }
6489 }
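
// Editor's note (interpretation of the CPUID masks above, not authoritative):
// CPUID leaf 1 returns family/model information in EAX; (eax & 0xff0) == 0xB10
// matches family 0x0B / model 1, i.e. Knights Corner (mic2), while
// (eax & 0xf0ff0) == 0x50670 matches family 6 / model 0x57, i.e. Knights
// Landing (mic3). Anything else is treated as a non-MIC host.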
6490 
6491 #endif /* KMP_MIC_SUPPORTED */
6492 
6493 static void __kmp_do_serial_initialize(void) {
6494  int i, gtid;
6495  int size;
6496 
6497  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6498 
6499  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6500  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6501  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6502  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6503  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6504 
6505 #if OMPT_SUPPORT
6506  ompt_pre_init();
6507 #endif
6508 
6509  __kmp_validate_locks();
6510 
6511  /* Initialize internal memory allocator */
6512  __kmp_init_allocator();
6513 
6514  /* Register the library startup via an environment variable and check to see
6515  whether another copy of the library is already registered. */
6516 
6517  __kmp_register_library_startup();
6518 
6519  /* TODO reinitialization of library */
6520  if (TCR_4(__kmp_global.g.g_done)) {
6521  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6522  }
6523 
6524  __kmp_global.g.g_abort = 0;
6525  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6526 
6527 /* initialize the locks */
6528 #if KMP_USE_ADAPTIVE_LOCKS
6529 #if KMP_DEBUG_ADAPTIVE_LOCKS
6530  __kmp_init_speculative_stats();
6531 #endif
6532 #endif
6533 #if KMP_STATS_ENABLED
6534  __kmp_stats_init();
6535 #endif
6536  __kmp_init_lock(&__kmp_global_lock);
6537  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6538  __kmp_init_lock(&__kmp_debug_lock);
6539  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6540  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6541  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6542  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6543  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6544  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6545  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6546  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6547  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6548  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6549  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6550  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6551  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6552  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6553  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6554 #if KMP_USE_MONITOR
6555  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6556 #endif
6557  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6558 
6559  /* conduct initialization and initial setup of configuration */
6560 
6561  __kmp_runtime_initialize();
6562 
6563 #if KMP_MIC_SUPPORTED
6564  __kmp_check_mic_type();
6565 #endif
6566 
6567 // Some global variable initialization moved here from kmp_env_initialize()
6568 #ifdef KMP_DEBUG
6569  kmp_diag = 0;
6570 #endif
6571  __kmp_abort_delay = 0;
6572 
6573  // From __kmp_init_dflt_team_nth()
6574  /* assume the entire machine will be used */
6575  __kmp_dflt_team_nth_ub = __kmp_xproc;
6576  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6577  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6578  }
6579  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6580  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6581  }
6582  __kmp_max_nth = __kmp_sys_max_nth;
6583  __kmp_cg_max_nth = __kmp_sys_max_nth;
6584  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6585  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6586  __kmp_teams_max_nth = __kmp_sys_max_nth;
6587  }
6588 
6589  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6590  // part
6591  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6592 #if KMP_USE_MONITOR
6593  __kmp_monitor_wakeups =
6594  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6595  __kmp_bt_intervals =
6596  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6597 #endif
6598  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6599  __kmp_library = library_throughput;
6600  // From KMP_SCHEDULE initialization
6601  __kmp_static = kmp_sch_static_balanced;
6602 // AC: do not use analytical here, because it is non-monotonic
6603 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6604 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6605 // need to repeat assignment
6606 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6607 // bit control and barrier method control parts
6608 #if KMP_FAST_REDUCTION_BARRIER
6609 #define kmp_reduction_barrier_gather_bb ((int)1)
6610 #define kmp_reduction_barrier_release_bb ((int)1)
6611 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6612 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6613 #endif // KMP_FAST_REDUCTION_BARRIER
6614  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6615  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6616  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6617  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6618  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6619 #if KMP_FAST_REDUCTION_BARRIER
6620  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6621  // lin_64 ): hyper,1
6622  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6623  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6624  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6625  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6626  }
6627 #endif // KMP_FAST_REDUCTION_BARRIER
6628  }
6629 #if KMP_FAST_REDUCTION_BARRIER
6630 #undef kmp_reduction_barrier_release_pat
6631 #undef kmp_reduction_barrier_gather_pat
6632 #undef kmp_reduction_barrier_release_bb
6633 #undef kmp_reduction_barrier_gather_bb
6634 #endif // KMP_FAST_REDUCTION_BARRIER
6635 #if KMP_MIC_SUPPORTED
6636  if (__kmp_mic_type == mic2) { // KNC
6637  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6638  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6639  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6640  1; // forkjoin release
6641  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6642  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6643  }
6644 #if KMP_FAST_REDUCTION_BARRIER
6645  if (__kmp_mic_type == mic2) { // KNC
6646  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6647  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6648  }
6649 #endif // KMP_FAST_REDUCTION_BARRIER
6650 #endif // KMP_MIC_SUPPORTED
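
// Editor's note (interpretation, based on how branch bits are consumed by the
// barrier code): a branch-bits value b gives the tree/hyper barriers a
// branching factor of 2^b, so the KNC plain-barrier gather value of 3 above
// means groups of 8 threads per level, and the fork/join release value of 1
// means pairwise signalling.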
6651 
6652 // From KMP_CHECKS initialization
6653 #ifdef KMP_DEBUG
6654  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6655 #else
6656  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6657 #endif
6658 
6659  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6660  __kmp_foreign_tp = TRUE;
6661 
6662  __kmp_global.g.g_dynamic = FALSE;
6663  __kmp_global.g.g_dynamic_mode = dynamic_default;
6664 
6665  __kmp_env_initialize(NULL);
6666 
6667 // Print all messages in message catalog for testing purposes.
6668 #ifdef KMP_DEBUG
6669  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6670  if (__kmp_str_match_true(val)) {
6671  kmp_str_buf_t buffer;
6672  __kmp_str_buf_init(&buffer);
6673  __kmp_i18n_dump_catalog(&buffer);
6674  __kmp_printf("%s", buffer.str);
6675  __kmp_str_buf_free(&buffer);
6676  }
6677  __kmp_env_free(&val);
6678 #endif
6679 
6680  __kmp_threads_capacity =
6681  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6682  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6683  __kmp_tp_capacity = __kmp_default_tp_capacity(
6684  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6685 
6686  // If the library is shut down properly, both pools must be NULL. Just in
6687  // case, set them to NULL -- some memory may leak, but subsequent code will
6688  // work even if pools are not freed.
6689  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6690  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6691  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6692  __kmp_thread_pool = NULL;
6693  __kmp_thread_pool_insert_pt = NULL;
6694  __kmp_team_pool = NULL;
6695 
6696  /* Allocate all of the variable sized records */
6697  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6698  * expandable */
6699  /* Since allocation is cache-aligned, just add extra padding at the end */
6700  size =
6701  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6702  CACHE_LINE;
6703  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6704  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6705  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6706 
6707  /* init thread counts */
6708  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6709  0); // Asserts fail if the library is reinitializing and
6710  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6711  __kmp_all_nth = 0;
6712  __kmp_nth = 0;
6713 
6714  /* setup the uber master thread and hierarchy */
6715  gtid = __kmp_register_root(TRUE);
6716  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6717  KMP_ASSERT(KMP_UBER_GTID(gtid));
6718  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6719 
6720  KMP_MB(); /* Flush all pending memory write invalidates. */
6721 
6722  __kmp_common_initialize();
6723 
6724 #if KMP_OS_UNIX
6725  /* invoke the child fork handler */
6726  __kmp_register_atfork();
6727 #endif
6728 
6729 #if !KMP_DYNAMIC_LIB
6730  {
6731  /* Invoke the exit handler when the program finishes, only for static
6732  library. For dynamic library, we already have _fini and DllMain. */
6733  int rc = atexit(__kmp_internal_end_atexit);
6734  if (rc != 0) {
6735  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6736  __kmp_msg_null);
6737  }
6738  }
6739 #endif
6740 
6741 #if KMP_HANDLE_SIGNALS
6742 #if KMP_OS_UNIX
6743  /* NOTE: make sure that this is called before the user installs their own
6744  signal handlers so that the user handlers are called first. this way they
6745  can return false, not call our handler, avoid terminating the library, and
6746  continue execution where they left off. */
6747  __kmp_install_signals(FALSE);
6748 #endif /* KMP_OS_UNIX */
6749 #if KMP_OS_WINDOWS
6750  __kmp_install_signals(TRUE);
6751 #endif /* KMP_OS_WINDOWS */
6752 #endif
6753 
6754  /* we have finished the serial initialization */
6755  __kmp_init_counter++;
6756 
6757  __kmp_init_serial = TRUE;
6758 
6759  if (__kmp_settings) {
6760  __kmp_env_print();
6761  }
6762 
6763 #if OMP_40_ENABLED
6764  if (__kmp_display_env || __kmp_display_env_verbose) {
6765  __kmp_env_print_2();
6766  }
6767 #endif // OMP_40_ENABLED
6768 
6769 #if OMPT_SUPPORT
6770  ompt_post_init();
6771 #endif
6772 
6773  KMP_MB();
6774 
6775  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6776 }
6777 
6778 void __kmp_serial_initialize(void) {
6779  if (__kmp_init_serial) {
6780  return;
6781  }
6782  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6783  if (__kmp_init_serial) {
6784  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6785  return;
6786  }
6787  __kmp_do_serial_initialize();
6788  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6789 }
6790 
6791 static void __kmp_do_middle_initialize(void) {
6792  int i, j;
6793  int prev_dflt_team_nth;
6794 
6795  if (!__kmp_init_serial) {
6796  __kmp_do_serial_initialize();
6797  }
6798 
6799  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6800 
6801  // Save the previous value for the __kmp_dflt_team_nth so that
6802  // we can avoid some reinitialization if it hasn't changed.
6803  prev_dflt_team_nth = __kmp_dflt_team_nth;
6804 
6805 #if KMP_AFFINITY_SUPPORTED
6806  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6807  // number of cores on the machine.
6808  __kmp_affinity_initialize();
6809 
6810  // Run through the __kmp_threads array and set the affinity mask
6811  // for each root thread that is currently registered with the RTL.
6812  for (i = 0; i < __kmp_threads_capacity; i++) {
6813  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6814  __kmp_affinity_set_init_mask(i, TRUE);
6815  }
6816  }
6817 #endif /* KMP_AFFINITY_SUPPORTED */
6818 
6819  KMP_ASSERT(__kmp_xproc > 0);
6820  if (__kmp_avail_proc == 0) {
6821  __kmp_avail_proc = __kmp_xproc;
6822  }
6823 
6824  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6825  // correct them now
6826  j = 0;
6827  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6828  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6829  __kmp_avail_proc;
6830  j++;
6831  }
6832 
6833  if (__kmp_dflt_team_nth == 0) {
6834 #ifdef KMP_DFLT_NTH_CORES
6835  // Default #threads = #cores
6836  __kmp_dflt_team_nth = __kmp_ncores;
6837  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6838  "__kmp_ncores (%d)\n",
6839  __kmp_dflt_team_nth));
6840 #else
6841  // Default #threads = #available OS procs
6842  __kmp_dflt_team_nth = __kmp_avail_proc;
6843  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6844  "__kmp_avail_proc(%d)\n",
6845  __kmp_dflt_team_nth));
6846 #endif /* KMP_DFLT_NTH_CORES */
6847  }
6848 
6849  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6850  __kmp_dflt_team_nth = KMP_MIN_NTH;
6851  }
6852  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6853  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6854  }
6855 
6856  // There's no harm in continuing if the following check fails,
6857  // but it indicates an error in the previous logic.
6858  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6859 
6860  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6861  // Run through the __kmp_threads array and set the num threads icv for each
6862  // root thread that is currently registered with the RTL (which has not
6863  // already explicitly set its nthreads-var with a call to
6864  // omp_set_num_threads()).
6865  for (i = 0; i < __kmp_threads_capacity; i++) {
6866  kmp_info_t *thread = __kmp_threads[i];
6867  if (thread == NULL)
6868  continue;
6869  if (thread->th.th_current_task->td_icvs.nproc != 0)
6870  continue;
6871 
6872  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6873  }
6874  }
6875  KA_TRACE(
6876  20,
6877  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6878  __kmp_dflt_team_nth));
6879 
6880 #ifdef KMP_ADJUST_BLOCKTIME
6881  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6882  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6883  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6884  if (__kmp_nth > __kmp_avail_proc) {
6885  __kmp_zero_bt = TRUE;
6886  }
6887  }
6888 #endif /* KMP_ADJUST_BLOCKTIME */
6889 
6890  /* we have finished middle initialization */
6891  TCW_SYNC_4(__kmp_init_middle, TRUE);
6892 
6893  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6894 }
6895 
6896 void __kmp_middle_initialize(void) {
6897  if (__kmp_init_middle) {
6898  return;
6899  }
6900  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6901  if (__kmp_init_middle) {
6902  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6903  return;
6904  }
6905  __kmp_do_middle_initialize();
6906  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6907 }
6908 
6909 void __kmp_parallel_initialize(void) {
6910  int gtid = __kmp_entry_gtid(); // this might be a new root
6911 
6912  /* synchronize parallel initialization (for sibling) */
6913  if (TCR_4(__kmp_init_parallel))
6914  return;
6915  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6916  if (TCR_4(__kmp_init_parallel)) {
6917  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6918  return;
6919  }
6920 
6921  /* TODO reinitialization after we have already shut down */
6922  if (TCR_4(__kmp_global.g.g_done)) {
6923  KA_TRACE(
6924  10,
6925  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6926  __kmp_infinite_loop();
6927  }
6928 
6929  /* jc: The lock __kmp_initz_lock is already held, so calling
6930  __kmp_serial_initialize would cause a deadlock. So we call
6931  __kmp_do_serial_initialize directly. */
6932  if (!__kmp_init_middle) {
6933  __kmp_do_middle_initialize();
6934  }
6935 
6936 #if OMP_50_ENABLED
6937  __kmp_resume_if_hard_paused();
6938 #endif
6939 
6940  /* begin initialization */
6941  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6942  KMP_ASSERT(KMP_UBER_GTID(gtid));
6943 
6944 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6945  // Save the FP control regs.
6946  // Worker threads will set theirs to these values at thread startup.
6947  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6948  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6949  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6950 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6951 
6952 #if KMP_OS_UNIX
6953 #if KMP_HANDLE_SIGNALS
6954  /* must be after __kmp_serial_initialize */
6955  __kmp_install_signals(TRUE);
6956 #endif
6957 #endif
6958 
6959  __kmp_suspend_initialize();
6960 
6961 #if defined(USE_LOAD_BALANCE)
6962  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6963  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6964  }
6965 #else
6966  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6967  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6968  }
6969 #endif
6970 
6971  if (__kmp_version) {
6972  __kmp_print_version_2();
6973  }
6974 
6975  /* we have finished parallel initialization */
6976  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6977 
6978  KMP_MB();
6979  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6980 
6981  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6982 }
6983 
6984 /* ------------------------------------------------------------------------ */
6985 
6986 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6987  kmp_team_t *team) {
6988  kmp_disp_t *dispatch;
6989 
6990  KMP_MB();
6991 
6992  /* none of the threads have encountered any constructs, yet. */
6993  this_thr->th.th_local.this_construct = 0;
6994 #if KMP_CACHE_MANAGE
6995  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6996 #endif /* KMP_CACHE_MANAGE */
6997  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6998  KMP_DEBUG_ASSERT(dispatch);
6999  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7000  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7001  // this_thr->th.th_info.ds.ds_tid ] );
7002 
7003  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7004 #if OMP_45_ENABLED
7005  dispatch->th_doacross_buf_idx =
7006  0; /* reset the doacross dispatch buffer counter */
7007 #endif
7008  if (__kmp_env_consistency_check)
7009  __kmp_push_parallel(gtid, team->t.t_ident);
7010 
7011  KMP_MB(); /* Flush all pending memory write invalidates. */
7012 }
7013 
7014 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7015  kmp_team_t *team) {
7016  if (__kmp_env_consistency_check)
7017  __kmp_pop_parallel(gtid, team->t.t_ident);
7018 
7019  __kmp_finish_implicit_task(this_thr);
7020 }
7021 
7022 int __kmp_invoke_task_func(int gtid) {
7023  int rc;
7024  int tid = __kmp_tid_from_gtid(gtid);
7025  kmp_info_t *this_thr = __kmp_threads[gtid];
7026  kmp_team_t *team = this_thr->th.th_team;
7027 
7028  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7029 #if USE_ITT_BUILD
7030  if (__itt_stack_caller_create_ptr) {
7031  __kmp_itt_stack_callee_enter(
7032  (__itt_caller)
7033  team->t.t_stack_id); // inform ittnotify about entering user's code
7034  }
7035 #endif /* USE_ITT_BUILD */
7036 #if INCLUDE_SSC_MARKS
7037  SSC_MARK_INVOKING();
7038 #endif
7039 
7040 #if OMPT_SUPPORT
7041  void *dummy;
7042  void **exit_runtime_p;
7043  ompt_data_t *my_task_data;
7044  ompt_data_t *my_parallel_data;
7045  int ompt_team_size;
7046 
7047  if (ompt_enabled.enabled) {
7048  exit_runtime_p = &(
7049  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7050  } else {
7051  exit_runtime_p = &dummy;
7052  }
7053 
7054  my_task_data =
7055  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7056  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7057  if (ompt_enabled.ompt_callback_implicit_task) {
7058  ompt_team_size = team->t.t_nproc;
7059  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7060  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7061  __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7062  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7063  }
7064 #endif
7065 
7066  {
7067  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7068  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7069  rc =
7070  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7071  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7072 #if OMPT_SUPPORT
7073  ,
7074  exit_runtime_p
7075 #endif
7076  );
7077 #if OMPT_SUPPORT
7078  *exit_runtime_p = NULL;
7079 #endif
7080  }
7081 
7082 #if USE_ITT_BUILD
7083  if (__itt_stack_caller_create_ptr) {
7084  __kmp_itt_stack_callee_leave(
7085  (__itt_caller)
7086  team->t.t_stack_id); // inform ittnotify about leaving user's code
7087  }
7088 #endif /* USE_ITT_BUILD */
7089  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7090 
7091  return rc;
7092 }
7093 
7094 #if OMP_40_ENABLED
7095 void __kmp_teams_master(int gtid) {
7096  // This routine is called by all master threads in teams construct
7097  kmp_info_t *thr = __kmp_threads[gtid];
7098  kmp_team_t *team = thr->th.th_team;
7099  ident_t *loc = team->t.t_ident;
7100  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7101  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7102  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7103  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7104  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7105 // Launch the league of teams now, but do not let workers execute
7106 // (they wait on the fork barrier until the next parallel region)
7107 #if INCLUDE_SSC_MARKS
7108  SSC_MARK_FORKING();
7109 #endif
7110  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7111  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7112  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7113 #if INCLUDE_SSC_MARKS
7114  SSC_MARK_JOINING();
7115 #endif
7116 
7117  // AC: the last parameter "1" eliminates the join barrier, which won't work
7118  // because worker threads are at the fork barrier waiting for more parallel regions
7119  __kmp_join_call(loc, gtid
7120 #if OMPT_SUPPORT
7121  ,
7122  fork_context_intel
7123 #endif
7124  ,
7125  1);
7126 }
7127 
7128 int __kmp_invoke_teams_master(int gtid) {
7129  kmp_info_t *this_thr = __kmp_threads[gtid];
7130  kmp_team_t *team = this_thr->th.th_team;
7131 #if KMP_DEBUG
7132  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7133  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7134  (void *)__kmp_teams_master);
7135 #endif
7136  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7137  __kmp_teams_master(gtid);
7138  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7139  return 1;
7140 }
7141 #endif /* OMP_40_ENABLED */
7142 
7143 /* This sets the requested number of threads for the next parallel region
7144  encountered by this team. Since this should be enclosed in the fork/join
7145  critical section, it should avoid race conditions with asymmetrical nested
7146  parallelism. */
7147 
7148 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7149  kmp_info_t *thr = __kmp_threads[gtid];
7150 
7151  if (num_threads > 0)
7152  thr->th.th_set_nproc = num_threads;
7153 }
7154 
7155 #if OMP_40_ENABLED
7156 
7157 /* this sets the requested number of teams for the teams region and/or
7158  the number of threads for the next parallel region encountered */
7159 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7160  int num_threads) {
7161  kmp_info_t *thr = __kmp_threads[gtid];
7162  KMP_DEBUG_ASSERT(num_teams >= 0);
7163  KMP_DEBUG_ASSERT(num_threads >= 0);
7164 
7165  if (num_teams == 0)
7166  num_teams = 1; // default number of teams is 1.
7167  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7168  if (!__kmp_reserve_warn) {
7169  __kmp_reserve_warn = 1;
7170  __kmp_msg(kmp_ms_warning,
7171  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7172  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7173  }
7174  num_teams = __kmp_teams_max_nth;
7175  }
7176  // Set number of teams (number of threads in the outer "parallel" of the
7177  // teams)
7178  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7179 
7180  // Remember the number of threads for inner parallel regions
7181  if (num_threads == 0) {
7182  if (!TCR_4(__kmp_init_middle))
7183  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7184  num_threads = __kmp_avail_proc / num_teams;
7185  if (num_teams * num_threads > __kmp_teams_max_nth) {
7186  // adjust num_threads w/o warning as it is not user setting
7187  num_threads = __kmp_teams_max_nth / num_teams;
7188  }
7189  } else {
7190  if (num_teams * num_threads > __kmp_teams_max_nth) {
7191  int new_threads = __kmp_teams_max_nth / num_teams;
7192  if (!__kmp_reserve_warn) { // user asked for too many threads
7193  __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7194  __kmp_msg(kmp_ms_warning,
7195  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7196  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7197  }
7198  num_threads = new_threads;
7199  }
7200  }
7201  thr->th.th_teams_size.nth = num_threads;
7202 }
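
// Worked example (illustrative numbers, not tied to any particular machine):
// with __kmp_avail_proc == 64 and __kmp_teams_max_nth == 64, a request of
// num_teams == 4 with num_threads == 0 yields num_threads = 64 / 4 = 16, i.e.
// 4 teams of 16 threads each. If the user instead asks for num_threads == 32,
// then 4 * 32 = 128 exceeds __kmp_teams_max_nth, so num_threads is clipped to
// 64 / 4 = 16 and a one-time warning is issued.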
7203 
7204 // Set the proc_bind var to use in the following parallel region.
7205 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7206  kmp_info_t *thr = __kmp_threads[gtid];
7207  thr->th.th_set_proc_bind = proc_bind;
7208 }
7209 
7210 #endif /* OMP_40_ENABLED */
7211 
7212 /* Launch the worker threads into the microtask. */
7213 
7214 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7215  kmp_info_t *this_thr = __kmp_threads[gtid];
7216 
7217 #ifdef KMP_DEBUG
7218  int f;
7219 #endif /* KMP_DEBUG */
7220 
7221  KMP_DEBUG_ASSERT(team);
7222  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7223  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7224  KMP_MB(); /* Flush all pending memory write invalidates. */
7225 
7226  team->t.t_construct = 0; /* no single directives seen yet */
7227  team->t.t_ordered.dt.t_value =
7228  0; /* thread 0 enters the ordered section first */
7229 
7230  /* Reset the identifiers on the dispatch buffer */
7231  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7232  if (team->t.t_max_nproc > 1) {
7233  int i;
7234  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7235  team->t.t_disp_buffer[i].buffer_index = i;
7236 #if OMP_45_ENABLED
7237  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7238 #endif
7239  }
7240  } else {
7241  team->t.t_disp_buffer[0].buffer_index = 0;
7242 #if OMP_45_ENABLED
7243  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7244 #endif
7245  }
7246 
7247  KMP_MB(); /* Flush all pending memory write invalidates. */
7248  KMP_ASSERT(this_thr->th.th_team == team);
7249 
7250 #ifdef KMP_DEBUG
7251  for (f = 0; f < team->t.t_nproc; f++) {
7252  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7253  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7254  }
7255 #endif /* KMP_DEBUG */
7256 
7257  /* release the worker threads so they may begin working */
7258  __kmp_fork_barrier(gtid, 0);
7259 }
7260 
7261 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7262  kmp_info_t *this_thr = __kmp_threads[gtid];
7263 
7264  KMP_DEBUG_ASSERT(team);
7265  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7266  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7267  KMP_MB(); /* Flush all pending memory write invalidates. */
7268 
7269 /* Join barrier after fork */
7270 
7271 #ifdef KMP_DEBUG
7272  if (__kmp_threads[gtid] &&
7273  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7274  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7275  __kmp_threads[gtid]);
7276  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7277  "team->t.t_nproc=%d\n",
7278  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7279  team->t.t_nproc);
7280  __kmp_print_structure();
7281  }
7282  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7283  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7284 #endif /* KMP_DEBUG */
7285 
7286  __kmp_join_barrier(gtid); /* wait for everyone */
7287 #if OMPT_SUPPORT
7288  if (ompt_enabled.enabled &&
7289  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7290  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7291  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7292  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7293 #if OMPT_OPTIONAL
7294  void *codeptr = NULL;
7295  if (KMP_MASTER_TID(ds_tid) &&
7296  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7297  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7298  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7299 
7300  if (ompt_enabled.ompt_callback_sync_region_wait) {
7301  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7302  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7303  }
7304  if (ompt_enabled.ompt_callback_sync_region) {
7305  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7306  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7307  }
7308 #endif
7309  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7310  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7311  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7312  }
7313  }
7314 #endif
7315 
7316  KMP_MB(); /* Flush all pending memory write invalidates. */
7317  KMP_ASSERT(this_thr->th.th_team == team);
7318 }
7319 
7320 /* ------------------------------------------------------------------------ */
7321 
7322 #ifdef USE_LOAD_BALANCE
7323 
7324 // Return the number of worker threads actively spinning in the hot team, if
7325 // we are at the outermost level of parallelism. Otherwise, return 0.
7326 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7327  int i;
7328  int retval;
7329  kmp_team_t *hot_team;
7330 
7331  if (root->r.r_active) {
7332  return 0;
7333  }
7334  hot_team = root->r.r_hot_team;
7335  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7336  return hot_team->t.t_nproc - 1; // Don't count master thread
7337  }
7338 
7339  // Skip the master thread - it is accounted for elsewhere.
7340  retval = 0;
7341  for (i = 1; i < hot_team->t.t_nproc; i++) {
7342  if (hot_team->t.t_threads[i]->th.th_active) {
7343  retval++;
7344  }
7345  }
7346  return retval;
7347 }
7348 
7349 // Perform an automatic adjustment to the number of
7350 // threads used by the next parallel region.
7351 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7352  int retval;
7353  int pool_active;
7354  int hot_team_active;
7355  int team_curr_active;
7356  int system_active;
7357 
7358  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7359  set_nproc));
7360  KMP_DEBUG_ASSERT(root);
7361  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7362  ->th.th_current_task->td_icvs.dynamic == TRUE);
7363  KMP_DEBUG_ASSERT(set_nproc > 1);
7364 
7365  if (set_nproc == 1) {
7366  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7367  return 1;
7368  }
7369 
7370  // Threads that are active in the thread pool, active in the hot team for this
7371  // particular root (if we are at the outer par level), and the currently
7372  // executing thread (to become the master) are available to add to the new
7373  // team, but are currently contributing to the system load, and must be
7374  // accounted for.
7375  pool_active = __kmp_thread_pool_active_nth;
7376  hot_team_active = __kmp_active_hot_team_nproc(root);
7377  team_curr_active = pool_active + hot_team_active + 1;
7378 
7379  // Check the system load.
7380  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7381  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7382  "hot team active = %d\n",
7383  system_active, pool_active, hot_team_active));
7384 
7385  if (system_active < 0) {
7386  // There was an error reading the necessary info from /proc, so use the
7387  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7388  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7389  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7390  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7391 
7392  // Make this call behave like the thread limit algorithm.
7393  retval = __kmp_avail_proc - __kmp_nth +
7394  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7395  if (retval > set_nproc) {
7396  retval = set_nproc;
7397  }
7398  if (retval < KMP_MIN_NTH) {
7399  retval = KMP_MIN_NTH;
7400  }
7401 
7402  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7403  retval));
7404  return retval;
7405  }
7406 
7407  // There is a slight delay in the load balance algorithm in detecting new
7408  // running procs. The real system load at this instant should be at least as
7409  // large as the number of active OMP threads available to add to the team.
7410  if (system_active < team_curr_active) {
7411  system_active = team_curr_active;
7412  }
7413  retval = __kmp_avail_proc - system_active + team_curr_active;
7414  if (retval > set_nproc) {
7415  retval = set_nproc;
7416  }
7417  if (retval < KMP_MIN_NTH) {
7418  retval = KMP_MIN_NTH;
7419  }
7420 
7421  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7422  return retval;
7423 } // __kmp_load_balance_nproc()
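
// Worked example (illustrative numbers): suppose __kmp_avail_proc == 16, the
// thread pool has 2 active threads, the hot team has 5 active workers, and the
// encountering thread itself brings team_curr_active to 2 + 5 + 1 == 8. If
// __kmp_get_load_balance() reports system_active == 12, then 12 - 8 == 4 procs
// are busy with work outside this runtime, and the code above allows
// retval = 16 - 12 + 8 == 12 threads (subsequently clipped to set_nproc and
// raised to at least KMP_MIN_NTH).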
7424 
7425 #endif /* USE_LOAD_BALANCE */
7426 
7427 /* ------------------------------------------------------------------------ */
7428 
7429 /* NOTE: this is called with the __kmp_init_lock held */
7430 void __kmp_cleanup(void) {
7431  int f;
7432 
7433  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7434 
7435  if (TCR_4(__kmp_init_parallel)) {
7436 #if KMP_HANDLE_SIGNALS
7437  __kmp_remove_signals();
7438 #endif
7439  TCW_4(__kmp_init_parallel, FALSE);
7440  }
7441 
7442  if (TCR_4(__kmp_init_middle)) {
7443 #if KMP_AFFINITY_SUPPORTED
7444  __kmp_affinity_uninitialize();
7445 #endif /* KMP_AFFINITY_SUPPORTED */
7446  __kmp_cleanup_hierarchy();
7447  TCW_4(__kmp_init_middle, FALSE);
7448  }
7449 
7450  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7451 
7452  if (__kmp_init_serial) {
7453  __kmp_runtime_destroy();
7454  __kmp_init_serial = FALSE;
7455  }
7456 
7457  __kmp_cleanup_threadprivate_caches();
7458 
7459  for (f = 0; f < __kmp_threads_capacity; f++) {
7460  if (__kmp_root[f] != NULL) {
7461  __kmp_free(__kmp_root[f]);
7462  __kmp_root[f] = NULL;
7463  }
7464  }
7465  __kmp_free(__kmp_threads);
7466  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7467  // there is no need to free __kmp_root separately.
7468  __kmp_threads = NULL;
7469  __kmp_root = NULL;
7470  __kmp_threads_capacity = 0;
7471 
7472 #if KMP_USE_DYNAMIC_LOCK
7473  __kmp_cleanup_indirect_user_locks();
7474 #else
7475  __kmp_cleanup_user_locks();
7476 #endif
7477 
7478 #if KMP_AFFINITY_SUPPORTED
7479  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7480  __kmp_cpuinfo_file = NULL;
7481 #endif /* KMP_AFFINITY_SUPPORTED */
7482 
7483 #if KMP_USE_ADAPTIVE_LOCKS
7484 #if KMP_DEBUG_ADAPTIVE_LOCKS
7485  __kmp_print_speculative_stats();
7486 #endif
7487 #endif
7488  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7489  __kmp_nested_nth.nth = NULL;
7490  __kmp_nested_nth.size = 0;
7491  __kmp_nested_nth.used = 0;
7492  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7493  __kmp_nested_proc_bind.bind_types = NULL;
7494  __kmp_nested_proc_bind.size = 0;
7495  __kmp_nested_proc_bind.used = 0;
7496 #if OMP_50_ENABLED
7497  if (__kmp_affinity_format) {
7498  KMP_INTERNAL_FREE(__kmp_affinity_format);
7499  __kmp_affinity_format = NULL;
7500  }
7501 #endif
7502 
7503  __kmp_i18n_catclose();
7504 
7505 #if KMP_USE_HIER_SCHED
7506  __kmp_hier_scheds.deallocate();
7507 #endif
7508 
7509 #if KMP_STATS_ENABLED
7510  __kmp_stats_fini();
7511 #endif
7512 
7513  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7514 }
7515 
7516 /* ------------------------------------------------------------------------ */
7517 
7518 int __kmp_ignore_mppbeg(void) {
7519  char *env;
7520 
7521  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7522  if (__kmp_str_match_false(env))
7523  return FALSE;
7524  }
7525  // By default __kmpc_begin() is a no-op.
7526  return TRUE;
7527 }
7528 
7529 int __kmp_ignore_mppend(void) {
7530  char *env;
7531 
7532  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7533  if (__kmp_str_match_false(env))
7534  return FALSE;
7535  }
7536  // By default __kmpc_end() is a no-op.
7537  return TRUE;
7538 }
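
// Usage note (behavior inferred from the two helpers above): __kmpc_begin()
// and __kmpc_end() are treated as no-ops unless the user explicitly sets
// KMP_IGNORE_MPPBEG=false or KMP_IGNORE_MPPEND=false in the environment, in
// which case the corresponding entry point performs its work.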
7539 
7540 void __kmp_internal_begin(void) {
7541  int gtid;
7542  kmp_root_t *root;
7543 
7544  /* this is a very important step as it will register new sibling threads
7545  and assign these new uber threads a new gtid */
7546  gtid = __kmp_entry_gtid();
7547  root = __kmp_threads[gtid]->th.th_root;
7548  KMP_ASSERT(KMP_UBER_GTID(gtid));
7549 
7550  if (root->r.r_begin)
7551  return;
7552  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7553  if (root->r.r_begin) {
7554  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7555  return;
7556  }
7557 
7558  root->r.r_begin = TRUE;
7559 
7560  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7561 }
7562 
7563 /* ------------------------------------------------------------------------ */
7564 
7565 void __kmp_user_set_library(enum library_type arg) {
7566  int gtid;
7567  kmp_root_t *root;
7568  kmp_info_t *thread;
7569 
7570  /* first, make sure we are initialized so we can get our gtid */
7571 
7572  gtid = __kmp_entry_gtid();
7573  thread = __kmp_threads[gtid];
7574 
7575  root = thread->th.th_root;
7576 
7577  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7578  library_serial));
7579  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7580  thread */
7581  KMP_WARNING(SetLibraryIncorrectCall);
7582  return;
7583  }
7584 
7585  switch (arg) {
7586  case library_serial:
7587  thread->th.th_set_nproc = 0;
7588  set__nproc(thread, 1);
7589  break;
7590  case library_turnaround:
7591  thread->th.th_set_nproc = 0;
7592  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7593  : __kmp_dflt_team_nth_ub);
7594  break;
7595  case library_throughput:
7596  thread->th.th_set_nproc = 0;
7597  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7598  : __kmp_dflt_team_nth_ub);
7599  break;
7600  default:
7601  KMP_FATAL(UnknownLibraryType, arg);
7602  }
7603 
7604  __kmp_aux_set_library(arg);
7605 }
7606 
7607 void __kmp_aux_set_stacksize(size_t arg) {
7608  if (!__kmp_init_serial)
7609  __kmp_serial_initialize();
7610 
7611 #if KMP_OS_DARWIN
7612  if (arg & (0x1000 - 1)) {
7613  arg &= ~(0x1000 - 1);
7614  if (arg + 0x1000) /* check for overflow if we round up */
7615  arg += 0x1000;
7616  }
7617 #endif
7618  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7619 
7620  /* only change the default stacksize before the first parallel region */
7621  if (!TCR_4(__kmp_init_parallel)) {
7622  size_t value = arg; /* argument is in bytes */
7623 
7624  if (value < __kmp_sys_min_stksize)
7625  value = __kmp_sys_min_stksize;
7626  else if (value > KMP_MAX_STKSIZE)
7627  value = KMP_MAX_STKSIZE;
7628 
7629  __kmp_stksize = value;
7630 
7631  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7632  }
7633 
7634  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7635 }
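
// Worked example of the KMP_OS_DARWIN rounding above (illustrative value): a
// request of arg == 0x12345 bytes is not page aligned, so it is first masked
// down to 0x12000 and then bumped by 0x1000 to 0x13000, i.e. rounded up to the
// next 4 KiB page boundary. The "arg + 0x1000" test only skips the bump when
// adding a page would wrap the size_t value around to zero.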
7636 
7637 /* set the behaviour of the runtime library */
7638 /* TODO this can cause some odd behaviour with sibling parallelism... */
7639 void __kmp_aux_set_library(enum library_type arg) {
7640  __kmp_library = arg;
7641 
7642  switch (__kmp_library) {
7643  case library_serial: {
7644  KMP_INFORM(LibraryIsSerial);
7645  (void)__kmp_change_library(TRUE);
7646  } break;
7647  case library_turnaround:
7648  (void)__kmp_change_library(TRUE);
7649  break;
7650  case library_throughput:
7651  (void)__kmp_change_library(FALSE);
7652  break;
7653  default:
7654  KMP_FATAL(UnknownLibraryType, arg);
7655  }
7656 }
7657 
7658 /* Getting team information common for all team API */
7659 // Returns NULL if not in teams construct
7660 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7661  kmp_info_t *thr = __kmp_entry_thread();
7662  teams_serialized = 0;
7663  if (thr->th.th_teams_microtask) {
7664  kmp_team_t *team = thr->th.th_team;
7665  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7666  int ii = team->t.t_level;
7667  teams_serialized = team->t.t_serialized;
7668  int level = tlevel + 1;
7669  KMP_DEBUG_ASSERT(ii >= tlevel);
7670  while (ii > level) {
7671  for (teams_serialized = team->t.t_serialized;
7672  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7673  }
7674  if (team->t.t_serialized && (!teams_serialized)) {
7675  team = team->t.t_parent;
7676  continue;
7677  }
7678  if (ii > level) {
7679  team = team->t.t_parent;
7680  ii--;
7681  }
7682  }
7683  return team;
7684  }
7685  return NULL;
7686 }
7687 
7688 int __kmp_aux_get_team_num() {
7689  int serialized;
7690  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7691  if (team) {
7692  if (serialized > 1) {
7693  return 0; // teams region is serialized ( 1 team of 1 thread ).
7694  } else {
7695  return team->t.t_master_tid;
7696  }
7697  }
7698  return 0;
7699 }
7700 
7701 int __kmp_aux_get_num_teams() {
7702  int serialized;
7703  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7704  if (team) {
7705  if (serialized > 1) {
7706  return 1;
7707  } else {
7708  return team->t.t_parent->t.t_nproc;
7709  }
7710  }
7711  return 1;
7712 }
7713 
7714 /* ------------------------------------------------------------------------ */
7715 
7716 #if OMP_50_ENABLED
7717 /*
7718  * Affinity Format Parser
7719  *
7720  * Field is in form of: %[[[0].]size]type
7721  * % and type are required (%% means print a literal '%')
7722  * type is either single char or long name surrounded by {},
7723  * e.g., N or {num_threads}
7724  * 0 => leading zeros
7725  * . => right justified when size is specified
7726  * by default output is left justified
7727  * size is the *minimum* field length
7728  * All other characters are printed as is
7729  *
7730  * Available field types:
7731  * t {team_num}        - omp_get_team_num()
7732  * T {num_teams}       - omp_get_num_teams()
7733  * L {nesting_level}   - omp_get_level()
7734  * n {thread_num}      - omp_get_thread_num()
7735  * N {num_threads}     - omp_get_num_threads()
7736  * a {ancestor_tnum}   - omp_get_ancestor_thread_num(omp_get_level()-1)
7737  * H {host} - host name; P {process_id} - pid; i {native_thread_id} - tid
7738  * A {thread_affinity} - comma separated list of integers or integer ranges
7739  * (values of affinity mask)
7740  *
7741  * Implementation-specific field types can be added
7742  * If a type is unknown, print "undefined"
7743 */
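
/* Illustrative example (made-up values), matching the field types above: with
 * a 4-thread team on a host named "hostA", the format string
 *     "OMP: host %H pid %P tid %0.4n affinity {%A}"
 * could expand for thread 2 to something like
 *     "OMP: host hostA pid 12345 tid 0002 affinity {0,4}"
 * i.e. "%0.4n" prints the thread number zero-padded and right justified in a
 * 4-character field, and "%A" prints the affinity mask as a list of OS proc
 * ids. Host name, pid and mask values here are invented for illustration. */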
7744 
7745 // Structure holding the short name, long name, and corresponding data type
7746  // for snprintf. A table of these represents the entire set of valid keyword
7747  // field types.
7748 typedef struct kmp_affinity_format_field_t {
7749  char short_name; // from spec e.g., L -> thread level
7750  const char *long_name; // from spec thread_level -> thread level
7751  char field_format; // data type for snprintf (typically 'd' or 's'
7752  // for integer or string)
7753 } kmp_affinity_format_field_t;
7754 
7755 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7756 #if KMP_AFFINITY_SUPPORTED
7757  {'A', "thread_affinity", 's'},
7758 #endif
7759  {'t', "team_num", 'd'},
7760  {'T', "num_teams", 'd'},
7761  {'L', "nesting_level", 'd'},
7762  {'n', "thread_num", 'd'},
7763  {'N', "num_threads", 'd'},
7764  {'a', "ancestor_tnum", 'd'},
7765  {'H', "host", 's'},
7766  {'P', "process_id", 'd'},
7767  {'i', "native_thread_id", 'd'}};
7768 
7769 // Return the number of characters it takes to hold field
7770 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7771  const char **ptr,
7772  kmp_str_buf_t *field_buffer) {
7773  int rc, format_index, field_value;
7774  const char *width_left, *width_right;
7775  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7776  static const int FORMAT_SIZE = 20;
7777  char format[FORMAT_SIZE] = {0};
7778  char absolute_short_name = 0;
7779 
7780  KMP_DEBUG_ASSERT(gtid >= 0);
7781  KMP_DEBUG_ASSERT(th);
7782  KMP_DEBUG_ASSERT(**ptr == '%');
7783  KMP_DEBUG_ASSERT(field_buffer);
7784 
7785  __kmp_str_buf_clear(field_buffer);
7786 
7787  // Skip the initial %
7788  (*ptr)++;
7789 
7790  // Check for %% first
7791  if (**ptr == '%') {
7792  __kmp_str_buf_cat(field_buffer, "%", 1);
7793  (*ptr)++; // skip over the second %
7794  return 1;
7795  }
7796 
7797  // Parse field modifiers if they are present
7798  pad_zeros = false;
7799  if (**ptr == '0') {
7800  pad_zeros = true;
7801  (*ptr)++; // skip over 0
7802  }
7803  right_justify = false;
7804  if (**ptr == '.') {
7805  right_justify = true;
7806  (*ptr)++; // skip over .
7807  }
7808  // Parse width of field: [width_left, width_right)
7809  width_left = width_right = NULL;
7810  if (**ptr >= '0' && **ptr <= '9') {
7811  width_left = *ptr;
7812  SKIP_DIGITS(*ptr);
7813  width_right = *ptr;
7814  }
7815 
7816  // Create the format for KMP_SNPRINTF based on flags parsed above
7817  format_index = 0;
7818  format[format_index++] = '%';
7819  if (!right_justify)
7820  format[format_index++] = '-';
7821  if (pad_zeros)
7822  format[format_index++] = '0';
7823  if (width_left && width_right) {
7824  int i = 0;
7825  // Only allow 8 digit number widths.
7826  // This also prevents overflowing format variable
7827  while (i < 8 && width_left < width_right) {
7828  format[format_index++] = *width_left;
7829  width_left++;
7830  i++;
7831  }
7832  }
7833 
7834  // Parse a name (long or short)
7835  // Canonicalize the name into absolute_short_name
7836  found_valid_name = false;
7837  parse_long_name = (**ptr == '{');
7838  if (parse_long_name)
7839  (*ptr)++; // skip initial left brace
7840  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7841  sizeof(__kmp_affinity_format_table[0]);
7842  ++i) {
7843  char short_name = __kmp_affinity_format_table[i].short_name;
7844  const char *long_name = __kmp_affinity_format_table[i].long_name;
7845  char field_format = __kmp_affinity_format_table[i].field_format;
7846  if (parse_long_name) {
7847  int length = KMP_STRLEN(long_name);
7848  if (strncmp(*ptr, long_name, length) == 0) {
7849  found_valid_name = true;
7850  (*ptr) += length; // skip the long name
7851  }
7852  } else if (**ptr == short_name) {
7853  found_valid_name = true;
7854  (*ptr)++; // skip the short name
7855  }
7856  if (found_valid_name) {
7857  format[format_index++] = field_format;
7858  format[format_index++] = '\0';
7859  absolute_short_name = short_name;
7860  break;
7861  }
7862  }
7863  if (parse_long_name) {
7864  if (**ptr != '}') {
7865  absolute_short_name = 0;
7866  } else {
7867  (*ptr)++; // skip over the right brace
7868  }
7869  }
7870 
7871  // Attempt to fill the buffer with the requested
7872  // value using snprintf within __kmp_str_buf_print()
7873  switch (absolute_short_name) {
7874  case 't':
7875  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7876  break;
7877  case 'T':
7878  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7879  break;
7880  case 'L':
7881  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7882  break;
7883  case 'n':
7884  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7885  break;
7886  case 'H': {
7887  static const int BUFFER_SIZE = 256;
7888  char buf[BUFFER_SIZE];
7889  __kmp_expand_host_name(buf, BUFFER_SIZE);
7890  rc = __kmp_str_buf_print(field_buffer, format, buf);
7891  } break;
7892  case 'P':
7893  rc = __kmp_str_buf_print(field_buffer, format, getpid());
7894  break;
7895  case 'i':
7896  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7897  break;
7898  case 'N':
7899  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7900  break;
7901  case 'a':
7902  field_value =
7903  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7904  rc = __kmp_str_buf_print(field_buffer, format, field_value);
7905  break;
7906 #if KMP_AFFINITY_SUPPORTED
7907  case 'A': {
7908  kmp_str_buf_t buf;
7909  __kmp_str_buf_init(&buf);
7910  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7911  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7912  __kmp_str_buf_free(&buf);
7913  } break;
7914 #endif
7915  default:
7916  // According to the spec, if an implementation does not have info for a
7917  // field type, then "undefined" is printed
7918  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7919  // Skip the field
7920  if (parse_long_name) {
7921  SKIP_TOKEN(*ptr);
7922  if (**ptr == '}')
7923  (*ptr)++;
7924  } else {
7925  (*ptr)++;
7926  }
7927  }
7928 
7929  KMP_ASSERT(format_index <= FORMAT_SIZE);
7930  return rc;
7931 }
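// Editorial example: with the parsing above, "%L" and "%{nesting_level}" both
// canonicalize to absolute_short_name 'L'.  An unrecognized name such as "%Z"
// or "%{foo}" leaves absolute_short_name at 0 and falls through to the default
// case of the switch above, so the literal string "undefined" is printed.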
7932 
7933 /*
7934  * Return number of characters needed to hold the affinity string
7935  * (not including the terminating null byte)
7936  * The resulting string is printed to buffer, which the caller can then
7937  * handle afterwards
7938 */
7939 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7940  kmp_str_buf_t *buffer) {
7941  const char *parse_ptr;
7942  size_t retval;
7943  const kmp_info_t *th;
7944  kmp_str_buf_t field;
7945 
7946  KMP_DEBUG_ASSERT(buffer);
7947  KMP_DEBUG_ASSERT(gtid >= 0);
7948 
7949  __kmp_str_buf_init(&field);
7950  __kmp_str_buf_clear(buffer);
7951 
7952  th = __kmp_threads[gtid];
7953  retval = 0;
7954 
7955  // If format is NULL or a zero-length string, then we use the
7956  // affinity-format-var ICV
7957  parse_ptr = format;
7958  if (parse_ptr == NULL || *parse_ptr == '\0') {
7959  parse_ptr = __kmp_affinity_format;
7960  }
7961  KMP_DEBUG_ASSERT(parse_ptr);
7962 
7963  while (*parse_ptr != '\0') {
7964  // Parse a field
7965  if (*parse_ptr == '%') {
7966  // Put field in the buffer
7967  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7968  __kmp_str_buf_catbuf(buffer, &field);
7969  retval += rc;
7970  } else {
7971  // Put literal character in buffer
7972  __kmp_str_buf_cat(buffer, parse_ptr, 1);
7973  retval++;
7974  parse_ptr++;
7975  }
7976  }
7977  __kmp_str_buf_free(&field);
7978  return retval;
7979 }
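/*
 * Editorial sketch (uses only the standard OpenMP 5.0 API, nothing defined in
 * this file): __kmp_aux_capture_affinity() backs the omp_capture_affinity()
 * entry point, and its return value has the same "characters needed,
 * excluding the terminating null" meaning:
 *
 *   #include <omp.h>
 *   #include <stdio.h>
 *
 *   int main(void) {
 *   #pragma omp parallel
 *     {
 *       char buf[256];
 *       size_t needed = omp_capture_affinity(buf, sizeof(buf),
 *                                            "T#%n of %N on %H");
 *       if (needed < sizeof(buf))
 *         printf("%s\n", buf);
 *     }
 *     return 0;
 *   }
 */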
7980 
7981 // Displays the affinity string to stdout
7982 void __kmp_aux_display_affinity(int gtid, const char *format) {
7983  kmp_str_buf_t buf;
7984  __kmp_str_buf_init(&buf);
7985  __kmp_aux_capture_affinity(gtid, format, &buf);
7986  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7987  __kmp_str_buf_free(&buf);
7988 }
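/*
 * Editorial sketch: this is the routine behind the OpenMP 5.0
 * omp_display_affinity() call (and the same path used when
 * OMP_DISPLAY_AFFINITY=TRUE prints at parallel-region entry); passing NULL
 * or "" falls back to the affinity-format-var ICV:
 *
 *   #include <omp.h>
 *
 *   int main(void) {
 *   #pragma omp parallel
 *     omp_display_affinity("level %L, thread %n/%N, affinity {%A}");
 *     return 0;
 *   }
 */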
7989 #endif // OMP_50_ENABLED
7990 
7991 /* ------------------------------------------------------------------------ */
7992 
7993 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7994  int blocktime = arg; /* argument is in milliseconds */
7995 #if KMP_USE_MONITOR
7996  int bt_intervals;
7997 #endif
7998  int bt_set;
7999 
8000  __kmp_save_internal_controls(thread);
8001 
8002  /* Normalize and set blocktime for the teams */
8003  if (blocktime < KMP_MIN_BLOCKTIME)
8004  blocktime = KMP_MIN_BLOCKTIME;
8005  else if (blocktime > KMP_MAX_BLOCKTIME)
8006  blocktime = KMP_MAX_BLOCKTIME;
8007 
8008  set__blocktime_team(thread->th.th_team, tid, blocktime);
8009  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8010 
8011 #if KMP_USE_MONITOR
8012  /* Calculate and set blocktime intervals for the teams */
8013  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8014 
8015  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8016  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8017 #endif
8018 
8019  /* Record that blocktime has been explicitly set (bt_set = TRUE) */
8020  bt_set = TRUE;
8021 
8022  set__bt_set_team(thread->th.th_team, tid, bt_set);
8023  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8024 #if KMP_USE_MONITOR
8025  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8026  "bt_intervals=%d, monitor_updates=%d\n",
8027  __kmp_gtid_from_tid(tid, thread->th.th_team),
8028  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8029  __kmp_monitor_wakeups));
8030 #else
8031  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8032  __kmp_gtid_from_tid(tid, thread->th.th_team),
8033  thread->th.th_team->t.t_id, tid, blocktime));
8034 #endif
8035 }
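/*
 * Editorial sketch: the user-visible paths into this routine are the
 * KMP_BLOCKTIME environment variable and the kmp_set_blocktime() extension
 * this runtime exports; the millisecond argument is clamped to
 * [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] as shown above:
 *
 *   #include <omp.h>
 *
 *   int main(void) {
 *     kmp_set_blocktime(0); // idle threads go to sleep immediately
 *   #pragma omp parallel
 *     { } // subsequent idle waits use the new blocktime
 *     return 0;
 *   }
 */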
8036 
8037 void __kmp_aux_set_defaults(char const *str, int len) {
8038  if (!__kmp_init_serial) {
8039  __kmp_serial_initialize();
8040  }
8041  __kmp_env_initialize(str);
8042 
8043  if (__kmp_settings
8044 #if OMP_40_ENABLED
8045  || __kmp_display_env || __kmp_display_env_verbose
8046 #endif // OMP_40_ENABLED
8047  ) {
8048  __kmp_env_print();
8049  }
8050 } // __kmp_aux_set_defaults
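/*
 * Editorial sketch: __kmp_aux_set_defaults() backs the kmp_set_defaults()
 * extension, which accepts the same "NAME=value" text that the corresponding
 * environment variables would carry (the single setting used below is only an
 * illustration):
 *
 *   #include <omp.h>
 *
 *   int main(void) {
 *     kmp_set_defaults("KMP_LIBRARY=turnaround");
 *   #pragma omp parallel
 *     { }
 *     return 0;
 *   }
 */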
8051 
8052 /* ------------------------------------------------------------------------ */
8053 /* internal fast reduction routines */
8054 
8055 PACKED_REDUCTION_METHOD_T
8056 __kmp_determine_reduction_method(
8057  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8058  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8059  kmp_critical_name *lck) {
8060 
8061  // Default reduction method: critical construct ( lck != NULL, like in current
8062  // PAROPT )
8063  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8064  // can be selected by RTL
8065  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8066  // can be selected by RTL
8067  // Finally, it's up to the OpenMP RTL to decide which method to select
8068  // among those generated by PAROPT.
8069 
8070  PACKED_REDUCTION_METHOD_T retval;
8071 
8072  int team_size;
8073 
8074  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8075  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8076 
8077 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8078  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8079 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8080 
8081  retval = critical_reduce_block;
8082 
8083  // an alternative way of getting the team size (with 1 dynamic dereference) is slower
8084  team_size = __kmp_get_team_num_threads(global_tid);
8085  if (team_size == 1) {
8086 
8087  retval = empty_reduce_block;
8088 
8089  } else {
8090 
8091  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8092 
8093 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8094 
8095 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8096  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8097 
8098  int teamsize_cutoff = 4;
8099 
8100 #if KMP_MIC_SUPPORTED
8101  if (__kmp_mic_type != non_mic) {
8102  teamsize_cutoff = 8;
8103  }
8104 #endif
8105  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8106  if (tree_available) {
8107  if (team_size <= teamsize_cutoff) {
8108  if (atomic_available) {
8109  retval = atomic_reduce_block;
8110  }
8111  } else {
8112  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8113  }
8114  } else if (atomic_available) {
8115  retval = atomic_reduce_block;
8116  }
8117 #else
8118 #error "Unknown or unsupported OS"
8119 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8120  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8121 
8122 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8123 
8124 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
8125 
8126  // basic tuning
8127 
8128  if (atomic_available) {
8129  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8130  retval = atomic_reduce_block;
8131  }
8132  } // otherwise: use critical section
8133 
8134 #elif KMP_OS_DARWIN
8135 
8136  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8137  if (atomic_available && (num_vars <= 3)) {
8138  retval = atomic_reduce_block;
8139  } else if (tree_available) {
8140  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8141  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8142  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8143  }
8144  } // otherwise: use critical section
8145 
8146 #else
8147 #error "Unknown or unsupported OS"
8148 #endif
8149 
8150 #else
8151 #error "Unknown or unsupported architecture"
8152 #endif
8153  }
8154 
8155  // KMP_FORCE_REDUCTION
8156 
8157  // If the team is serialized (team_size == 1), ignore the forced reduction
8158  // method and stay with the unsynchronized method (empty_reduce_block)
8159  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8160  team_size != 1) {
8161 
8162  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8163 
8164  int atomic_available, tree_available;
8165 
8166  switch ((forced_retval = __kmp_force_reduction_method)) {
8167  case critical_reduce_block:
8168  KMP_ASSERT(lck); // lck should be != 0
8169  break;
8170 
8171  case atomic_reduce_block:
8172  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8173  if (!atomic_available) {
8174  KMP_WARNING(RedMethodNotSupported, "atomic");
8175  forced_retval = critical_reduce_block;
8176  }
8177  break;
8178 
8179  case tree_reduce_block:
8180  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8181  if (!tree_available) {
8182  KMP_WARNING(RedMethodNotSupported, "tree");
8183  forced_retval = critical_reduce_block;
8184  } else {
8185 #if KMP_FAST_REDUCTION_BARRIER
8186  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8187 #endif
8188  }
8189  break;
8190 
8191  default:
8192  KMP_ASSERT(0); // "unsupported method specified"
8193  }
8194 
8195  retval = forced_retval;
8196  }
8197 
8198  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8199 
8200 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8201 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8202 
8203  return (retval);
8204 }
8205 
8206 // this function is for testing set/get/determine reduce method
8207 kmp_int32 __kmp_get_reduce_method(void) {
8208  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8209 }
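/*
 * Editorial sketch: any compiler-generated reduction exercises the selection
 * logic above, and the KMP_FORCE_REDUCTION environment variable (which sets
 * __kmp_force_reduction_method, parsed in kmp_settings.cpp) can override the
 * heuristic except in the serialized team_size == 1 case:
 *
 *   // e.g. run with:  KMP_FORCE_REDUCTION=atomic ./a.out
 *   #include <omp.h>
 *
 *   int main(void) {
 *     int sum = 0;
 *   #pragma omp parallel for reduction(+ : sum)
 *     for (int i = 0; i < 1000; ++i)
 *       sum += i;
 *     return sum == 499500 ? 0 : 1;
 *   }
 */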
8210 
8211 #if OMP_50_ENABLED
8212 
8213 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8214 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8215 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8216 
8217 // Hard pause shuts down the runtime completely. Resume happens naturally when
8218 // OpenMP is used subsequently.
8219 void __kmp_hard_pause() {
8220  __kmp_pause_status = kmp_hard_paused;
8221  __kmp_internal_end_thread(-1);
8222 }
8223 
8224 // Soft resume sets __kmp_pause_status and wakes up all threads.
8225 void __kmp_resume_if_soft_paused() {
8226  if (__kmp_pause_status == kmp_soft_paused) {
8227  __kmp_pause_status = kmp_not_paused;
8228 
8229  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8230  kmp_info_t *thread = __kmp_threads[gtid];
8231  if (thread) { // Wake it if sleeping
8232  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8233  if (fl.is_sleeping())
8234  fl.resume(gtid);
8235  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8236  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8237  } else { // thread holds the lock and may sleep soon
8238  do { // until either the thread sleeps, or we can get the lock
8239  if (fl.is_sleeping()) {
8240  fl.resume(gtid);
8241  break;
8242  } else if (__kmp_try_suspend_mx(thread)) {
8243  __kmp_unlock_suspend_mx(thread);
8244  break;
8245  }
8246  } while (1);
8247  }
8248  }
8249  }
8250  }
8251 }
8252 
8253 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8254 // TODO: add warning messages
8255 int __kmp_pause_resource(kmp_pause_status_t level) {
8256  if (level == kmp_not_paused) { // requesting resume
8257  if (__kmp_pause_status == kmp_not_paused) {
8258  // error message about runtime not being paused, so can't resume
8259  return 1;
8260  } else {
8261  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8262  __kmp_pause_status == kmp_hard_paused);
8263  __kmp_pause_status = kmp_not_paused;
8264  return 0;
8265  }
8266  } else if (level == kmp_soft_paused) { // requesting soft pause
8267  if (__kmp_pause_status != kmp_not_paused) {
8268  // error message about already being paused
8269  return 1;
8270  } else {
8271  __kmp_soft_pause();
8272  return 0;
8273  }
8274  } else if (level == kmp_hard_paused) { // requesting hard pause
8275  if (__kmp_pause_status != kmp_not_paused) {
8276  // error message about already being paused
8277  return 1;
8278  } else {
8279  __kmp_hard_pause();
8280  return 0;
8281  }
8282  } else {
8283  // error message about invalid level
8284  return 1;
8285  }
8286 }
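/*
 * Editorial sketch: this routine sits behind the OpenMP 5.0
 * omp_pause_resource() / omp_pause_resource_all() API; a return value of 0
 * means the pause or resume request was honored:
 *
 *   #include <omp.h>
 *
 *   int main(void) {
 *   #pragma omp parallel
 *     { } // create the thread team
 *     if (omp_pause_resource_all(omp_pause_soft) != 0)
 *       return 1; // e.g. the runtime was already paused
 *   #pragma omp parallel
 *     { } // using OpenMP again resumes the soft-paused runtime
 *     return 0;
 *   }
 */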
8287 
8288 #endif // OMP_50_ENABLED