LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46  KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] =
50  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] =
54  KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58 
59 /* ------------------------------------------------------------------------ */
60 
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64 
65 /* Forward declarations */
66 
67 void __kmp_cleanup(void);
68 
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70  int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72  kmp_internal_control_t *new_icvs,
73  ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76  int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82  kmp_internal_control_t *new_icvs, ident_t *loc);
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87 
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get unique identifier of executing
98  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99 int __kmp_get_global_thread_id() {
100  int i;
101  kmp_info_t **other_threads;
102  size_t stack_data;
103  char *stack_addr;
104  size_t stack_size;
105  char *stack_base;
106 
107  KA_TRACE(
108  1000,
109  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
110  __kmp_nth, __kmp_all_nth));
111 
112  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115  __kmp_init_gtid for this to work. */
116 
117  if (!TCR_4(__kmp_init_gtid))
118  return KMP_GTID_DNE;
119 
120 #ifdef KMP_TDATA_GTID
121  if (TCR_4(__kmp_gtid_mode) >= 3) {
122  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123  return __kmp_gtid;
124  }
125 #endif
126  if (TCR_4(__kmp_gtid_mode) >= 2) {
127  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128  return __kmp_gtid_get_specific();
129  }
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131 
132  stack_addr = (char *)&stack_data;
133  other_threads = __kmp_threads;
134 
135  /* ATT: The code below is a source of potential bugs due to unsynchronized
136  access to __kmp_threads array. For example:
137  1. Current thread loads other_threads[i] to thr and checks it, it is
138  non-NULL.
139  2. Current thread is suspended by OS.
140  3. Another thread unregisters and finishes (debug versions of free()
141  may fill memory with something like 0xEF).
142  4. Current thread is resumed.
143  5. Current thread reads junk from *thr.
144  TODO: Fix it. --ln */
145 
146  for (i = 0; i < __kmp_threads_capacity; i++) {
147 
148  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149  if (!thr)
150  continue;
151 
152  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154 
155  /* stack grows down -- search through all of the active threads */
156 
157  if (stack_addr <= stack_base) {
158  size_t stack_diff = stack_base - stack_addr;
159 
160  if (stack_diff <= stack_size) {
161  /* The only way we can be closer than the allocated */
162  /* stack size is if we are running on this thread. */
163  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164  return i;
165  }
166  }
167  }
168 
169  /* get specific to try and determine our gtid */
170  KA_TRACE(1000,
171  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172  "thread, using TLS\n"));
173  i = __kmp_gtid_get_specific();
174 
175  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
176 
177  /* if we haven't been assigned a gtid, then return that code */
178  if (i < 0)
179  return i;
180 
181  /* dynamically updated stack window for uber threads to avoid get_specific
182  call */
183  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184  KMP_FATAL(StackOverflow, i);
185  }
186 
187  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188  if (stack_addr > stack_base) {
189  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192  stack_base);
193  } else {
194  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195  stack_base - stack_addr);
196  }
197 
198  /* Reprint stack bounds for ubermaster since they have been refined */
199  if (__kmp_storage_map) {
200  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203  other_threads[i]->th.th_info.ds.ds_stacksize,
204  "th_%d stack (refinement)", i);
205  }
206  return i;
207 }
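
/* Illustrative sketch (not part of the original source; excluded from the
   build): the internal algorithm above treats each registered thread's stack
   as the window [ds_stackbase - ds_stacksize, ds_stackbase] and asks whether
   the address of a local variable falls inside it, relying on the stack
   growing downward. A minimal standalone version of that membership test,
   with hypothetical base/size/address arguments, could look like this: */
#if 0
#include <cstddef>

static bool stack_window_owns(const char *stack_base, std::size_t stack_size,
                              const char *candidate_addr) {
  // The candidate must lie at or below the recorded base, and no farther
  // below it than the recorded stack size.
  if (candidate_addr <= stack_base) {
    std::size_t diff = (std::size_t)(stack_base - candidate_addr);
    return diff <= stack_size;
  }
  return false;
}
#endif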
208 
209 int __kmp_get_global_thread_id_reg() {
210  int gtid;
211 
212  if (!__kmp_init_serial) {
213  gtid = KMP_GTID_DNE;
214  } else
215 #ifdef KMP_TDATA_GTID
216  if (TCR_4(__kmp_gtid_mode) >= 3) {
217  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218  gtid = __kmp_gtid;
219  } else
220 #endif
221  if (TCR_4(__kmp_gtid_mode) >= 2) {
222  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223  gtid = __kmp_gtid_get_specific();
224  } else {
225  KA_TRACE(1000,
226  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227  gtid = __kmp_get_global_thread_id();
228  }
229 
230  /* we must be a new uber master sibling thread */
231  if (gtid == KMP_GTID_DNE) {
232  KA_TRACE(10,
233  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234  "Registering a new gtid.\n"));
235  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236  if (!__kmp_init_serial) {
237  __kmp_do_serial_initialize();
238  gtid = __kmp_gtid_get_specific();
239  } else {
240  gtid = __kmp_register_root(FALSE);
241  }
242  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244  }
245 
246  KMP_DEBUG_ASSERT(gtid >= 0);
247 
248  return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
253  int f;
254  char *stack_beg = NULL;
255  char *stack_end = NULL;
256  int gtid;
257 
258  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259  if (__kmp_storage_map) {
260  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263  gtid = __kmp_gtid_from_thread(th);
264 
265  if (gtid == KMP_GTID_MONITOR) {
266  __kmp_print_storage_map_gtid(
267  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268  "th_%s stack (%s)", "mon",
269  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270  } else {
271  __kmp_print_storage_map_gtid(
272  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273  "th_%d stack (%s)", gtid,
274  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275  }
276  }
277 
278  /* No point in checking ubermaster threads since they use refinement and
279  * cannot overlap */
280  gtid = __kmp_gtid_from_thread(th);
281  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282  KA_TRACE(10,
283  ("__kmp_check_stack_overlap: performing extensive checking\n"));
284  if (stack_beg == NULL) {
285  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287  }
288 
289  for (f = 0; f < __kmp_threads_capacity; f++) {
290  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291 
292  if (f_th && f_th != th) {
293  char *other_stack_end =
294  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295  char *other_stack_beg =
296  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300  /* Print the other stack values before the abort */
301  if (__kmp_storage_map)
302  __kmp_print_storage_map_gtid(
303  -1, other_stack_beg, other_stack_end,
304  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306 
307  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308  __kmp_msg_null);
309  }
310  }
311  }
312  }
313  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314 }
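
/* Illustrative sketch (not part of the original source; excluded from the
   build): the extensive check above reports an overlap when either endpoint
   of this thread's stack falls strictly inside another thread's
   (other_beg, other_end) range. A standalone version of that interval test,
   with hypothetical begin/end pointers: */
#if 0
static bool stacks_overlap(const char *beg, const char *end,
                           const char *other_beg, const char *other_end) {
  // Mirrors the condition used above: either boundary lying strictly between
  // the other thread's stack bounds counts as an overlap.
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif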
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void __kmp_infinite_loop(void) {
319  static int done = FALSE;
320 
321  while (!done) {
322  KMP_YIELD(TRUE);
323  }
324 }
325 
326 #define MAX_MESSAGE 512
327 
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329  char const *format, ...) {
330  char buffer[MAX_MESSAGE];
331  va_list ap;
332 
333  va_start(ap, format);
334  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335  p2, (unsigned long)size, format);
336  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337  __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339  int node;
340  if (gtid >= 0) {
341  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342  if (__kmp_storage_map_verbose) {
343  node = __kmp_get_host_node(p1);
344  if (node < 0) /* doesn't work, so don't try this next time */
345  __kmp_storage_map_verbose = FALSE;
346  else {
347  char *last;
348  int lastNode;
349  int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351  const int page_size = KMP_GET_PAGE_SIZE();
352 
353  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355  if (localProc >= 0)
356  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
357  localProc >> 1);
358  else
359  __kmp_printf_no_lock(" GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361  /* The more elaborate format is disabled for now because of the prctl
362  * hanging bug. */
363  do {
364  last = p1;
365  lastNode = node;
366  /* This loop collates adjacent pages with the same host node. */
367  do {
368  (char *)p1 += page_size;
369  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
371  lastNode);
372  } while (p1 <= p2);
373 #else
374  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
375  (char *)p1 + (page_size - 1),
376  __kmp_get_host_node(p1));
377  if (p1 < p2) {
378  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
379  (char *)p2 + (page_size - 1),
380  __kmp_get_host_node(p2));
381  }
382 #endif
383  }
384  }
385  } else
386  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
387  }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390 }
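
/* Illustrative sketch (not part of the original source; excluded from the
   build): the data-placement branch above rounds addresses down to a page
   boundary with the usual mask trick, which only works when the page size is
   a power of two. A standalone version of that rounding (helper name is
   ours): */
#if 0
#include <cstddef>
#include <cstdint>

static void *round_down_to_page(void *p, std::size_t page_size) {
  // Clearing the low-order bits keeps only the page-aligned part of the
  // address; valid only for power-of-two page sizes.
  return (void *)((std::uintptr_t)p & ~((std::uintptr_t)page_size - 1));
}
#endif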
391 
392 void __kmp_warn(char const *format, ...) {
393  char buffer[MAX_MESSAGE];
394  va_list ap;
395 
396  if (__kmp_generate_warnings == kmp_warnings_off) {
397  return;
398  }
399 
400  va_start(ap, format);
401 
402  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404  __kmp_vprintf(kmp_err, buffer, ap);
405  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 
407  va_end(ap);
408 }
409 
410 void __kmp_abort_process() {
411  // Later threads may stall here, but that's ok because abort() will kill them.
412  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413 
414  if (__kmp_debug_buf) {
415  __kmp_dump_debug_buffer();
416  }
417 
418  if (KMP_OS_WINDOWS) {
419  // Let other threads know of abnormal termination and prevent deadlock
420  // if abort happened during library initialization or shutdown
421  __kmp_global.g.g_abort = SIGABRT;
422 
423  /* On Windows* OS by default abort() causes pop-up error box, which stalls
424  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425  boxes. _set_abort_behavior() works well, but this function is not
426  available in VS7 (this is not a problem for the DLL, but it is a problem for
427  the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
428  help, at least in some versions of MS C RTL.
429 
430  It seems the following sequence is the only way to simulate abort() and
431  avoid the pop-up error box. */
432  raise(SIGABRT);
433  _exit(3); // Just in case, if signal ignored, exit anyway.
434  } else {
435  abort();
436  }
437 
438  __kmp_infinite_loop();
439  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440 
441 } // __kmp_abort_process
442 
443 void __kmp_abort_thread(void) {
444  // TODO: Eliminate g_abort global variable and this function.
445  // In case of abort just call abort(), it will kill all the threads.
446  __kmp_infinite_loop();
447 } // __kmp_abort_thread
448 
449 /* Print out the storage map for the major kmp_info_t thread data structures
450  that are allocated together. */
451 
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454  gtid);
455 
456  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458 
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460  sizeof(kmp_local_t), "th_%d.th_local", gtid);
461 
462  __kmp_print_storage_map_gtid(
463  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465 
466  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467  &thr->th.th_bar[bs_plain_barrier + 1],
468  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469  gtid);
470 
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472  &thr->th.th_bar[bs_forkjoin_barrier + 1],
473  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474  gtid);
475 
476 #if KMP_FAST_REDUCTION_BARRIER
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478  &thr->th.th_bar[bs_reduction_barrier + 1],
479  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480  gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /* Print out the storage map for the major kmp_team_t team data structures
485  that are allocated together. */
486 
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488  int team_id, int num_thr) {
489  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491  header, team_id);
492 
493  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494  &team->t.t_bar[bs_last_barrier],
495  sizeof(kmp_balign_team_t) * bs_last_barrier,
496  "%s_%d.t_bar", header, team_id);
497 
498  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499  &team->t.t_bar[bs_plain_barrier + 1],
500  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501  header, team_id);
502 
503  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504  &team->t.t_bar[bs_forkjoin_barrier + 1],
505  sizeof(kmp_balign_team_t),
506  "%s_%d.t_bar[forkjoin]", header, team_id);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510  &team->t.t_bar[bs_reduction_barrier + 1],
511  sizeof(kmp_balign_team_t),
512  "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 
515  __kmp_print_storage_map_gtid(
516  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518 
519  __kmp_print_storage_map_gtid(
520  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522 
523  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524  &team->t.t_disp_buffer[num_disp_buff],
525  sizeof(dispatch_shared_info_t) * num_disp_buff,
526  "%s_%d.t_disp_buffer", header, team_id);
527 }
528 
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531 
532 /* ------------------------------------------------------------------------ */
533 
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536 
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538  // TODO: Change to __kmp_break_bootstrap_lock().
539  __kmp_init_bootstrap_lock(lck); // make the lock released
540 }
541 
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543  int i;
544  int thread_count;
545 
546  // PROCESS_DETACH is expected to be called by a thread that executes
547  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550  // threads can be still alive here, although being about to be terminated. The
551  // threads can still be alive here, although they are about to be terminated.
552  // The threads in the array with ds_thread==0 are the most suspicious, so it
553  // may not be safe to access __kmp_threads[].
554  // TODO: does it make sense to check __kmp_roots[] ?
555 
556  // Let's check that there are no other alive threads registered with the OMP
557  // lib.
558  while (1) {
559  thread_count = 0;
560  for (i = 0; i < __kmp_threads_capacity; ++i) {
561  if (!__kmp_threads)
562  continue;
563  kmp_info_t *th = __kmp_threads[i];
564  if (th == NULL)
565  continue;
566  int gtid = th->th.th_info.ds.ds_gtid;
567  if (gtid == gtid_req)
568  continue;
569  if (gtid < 0)
570  continue;
571  DWORD exit_val;
572  int alive = __kmp_is_thread_alive(th, &exit_val);
573  if (alive) {
574  ++thread_count;
575  }
576  }
577  if (thread_count == 0)
578  break; // success
579  }
580 
581  // Assume that I'm alone. Now it might be safe to check and reset locks.
582  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583  __kmp_reset_lock(&__kmp_forkjoin_lock);
584 #ifdef KMP_DEBUG
585  __kmp_reset_lock(&__kmp_stdio_lock);
586 #endif // KMP_DEBUG
587 }
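
/* Illustrative sketch (not part of the original source; excluded from the
   build): the loop above simply polls the registered-thread table until no
   thread other than the one running DllMain is still alive, and only then
   resets the bootstrap locks. A standalone model of that polling loop, with a
   hypothetical is_alive() predicate: */
#if 0
static void wait_until_alone(bool (*is_alive)(int slot), int nslots,
                             int my_slot) {
  for (;;) {
    int alive = 0;
    for (int i = 0; i < nslots; ++i)
      if (i != my_slot && is_alive(i))
        ++alive;
    if (alive == 0)
      break; // we are the only registered thread left; proceed to reset locks
  }
}
#endif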
588 
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591 
592  switch (fdwReason) {
593 
594  case DLL_PROCESS_ATTACH:
595  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596 
597  return TRUE;
598 
599  case DLL_PROCESS_DETACH:
600  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601 
602  if (lpReserved != NULL) {
603  // lpReserved is used for telling the difference:
604  // lpReserved == NULL when FreeLibrary() was called,
605  // lpReserved != NULL when the process terminates.
606  // When FreeLibrary() is called, worker threads remain alive. So they will
607  // release the forkjoin lock by themselves. When the process terminates,
608  // worker threads disappear triggering the problem of unreleased forkjoin
609  // lock as described below.
610 
611  // A worker thread can take the forkjoin lock. The problem comes up if
612  // that worker thread becomes dead before it releases the forkjoin lock.
613  // The forkjoin lock remains taken, while the thread executing
614  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615  // to take the forkjoin lock and will always fail, so that the application
616  // will never finish [normally]. This scenario is possible if
617  // __kmpc_end() has not been executed. This is not just a corner
618  // case; common cases include:
619  // - the main function was compiled by an alternative compiler;
620  // - the main function was compiled by icl but without /Qopenmp
621  // (application with plugins);
622  // - application terminates by calling C exit(), Fortran CALL EXIT() or
623  // Fortran STOP.
624  // - alive foreign thread prevented __kmpc_end from doing cleanup.
625  //
626  // This is a hack to work around the problem.
627  // TODO: !!! figure out something better.
628  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629  }
630 
631  __kmp_internal_end_library(__kmp_gtid_get_specific());
632 
633  return TRUE;
634 
635  case DLL_THREAD_ATTACH:
636  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637 
638  /* if we want to register new siblings all the time here call
639  * __kmp_get_gtid(); */
640  return TRUE;
641 
642  case DLL_THREAD_DETACH:
643  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644 
645  __kmp_internal_end_thread(__kmp_gtid_get_specific());
646  return TRUE;
647  }
648 
649  return TRUE;
650 }
651 
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654 
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657  int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659  kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662  if (__kmp_env_consistency_check) {
663  if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666 #else
667  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668 #endif
669  }
670 #ifdef BUILD_PARALLEL_ORDERED
671  if (!team->t.t_serialized) {
672  KMP_MB();
673  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674  NULL);
675  KMP_MB();
676  }
677 #endif /* BUILD_PARALLEL_ORDERED */
678 }
679 
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684  int tid = __kmp_tid_from_gtid(gtid);
685  kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
687 
688  if (__kmp_env_consistency_check) {
689  if (__kmp_threads[gtid]->th.th_root->r.r_active)
690  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691  }
692 #ifdef BUILD_PARALLEL_ORDERED
693  if (!team->t.t_serialized) {
694  KMP_MB(); /* Flush all pending memory write invalidates. */
695 
696  /* use the tid of the next thread in this team */
697  /* TODO replace with general release procedure */
698  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699 
700  KMP_MB(); /* Flush all pending memory write invalidates. */
701  }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
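
/* Illustrative sketch (not part of the original source; excluded from the
   build): taken together, __kmp_parallel_deo/__kmp_parallel_dxo implement a
   turn counter for ordered sections: each thread waits until the shared value
   equals its own tid, does its ordered work, then passes the turn to
   (tid + 1) % nproc. A minimal standalone model of that hand-off, using
   std::atomic instead of the runtime's KMP_WAIT/flag machinery: */
#if 0
#include <atomic>

static std::atomic<int> g_turn{0}; // models team->t.t_ordered.dt.t_value

static void ordered_enter(int tid) {
  // Spin until it is this thread's turn (the runtime waits with backoff
  // rather than spinning like this).
  while (g_turn.load(std::memory_order_acquire) != tid) {
  }
}

static void ordered_exit(int tid, int nproc) {
  // Hand the turn to the next thread in the team.
  g_turn.store((tid + 1) % nproc, std::memory_order_release);
}
#endif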
704 
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit */
707 
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709  int status;
710  kmp_info_t *th;
711  kmp_team_t *team;
712 
713  if (!TCR_4(__kmp_init_parallel))
714  __kmp_parallel_initialize();
715  __kmp_resume_if_soft_paused();
716 
717  th = __kmp_threads[gtid];
718  team = th->th.th_team;
719  status = 0;
720 
721  th->th.th_ident = id_ref;
722 
723  if (team->t.t_serialized) {
724  status = 1;
725  } else {
726  kmp_int32 old_this = th->th.th_local.this_construct;
727 
728  ++th->th.th_local.this_construct;
729  /* try to set team count to thread count--success means thread got the
730  single block */
731  /* TODO: Should this be acquire or release? */
732  if (team->t.t_construct == old_this) {
733  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734  th->th.th_local.this_construct);
735  }
736 #if USE_ITT_BUILD
737  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739  team->t.t_active_level ==
740  1) { // Only report metadata by master of active team at level 1
741  __kmp_itt_metadata_single(id_ref);
742  }
743 #endif /* USE_ITT_BUILD */
744  }
745 
746  if (__kmp_env_consistency_check) {
747  if (status && push_ws) {
748  __kmp_push_workshare(gtid, ct_psingle, id_ref);
749  } else {
750  __kmp_check_workshare(gtid, ct_psingle, id_ref);
751  }
752  }
753 #if USE_ITT_BUILD
754  if (status) {
755  __kmp_itt_single_start(gtid);
756  }
757 #endif /* USE_ITT_BUILD */
758  return status;
759 }
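
/* Illustrative sketch (not part of the original source; excluded from the
   build): the winner of a SINGLE is decided above by a compare-and-swap on
   the team's construct counter: every thread expects the counter to equal the
   number of constructs it has already passed, and exactly one thread succeeds
   in advancing it. A standalone model of that claim, with hypothetical
   state: */
#if 0
#include <atomic>

static bool try_claim_single(std::atomic<int> &team_construct,
                             int &my_construct_count) {
  int expected = my_construct_count; // constructs this thread has already seen
  ++my_construct_count;              // we are now entering the next one
  // Only the first thread to arrive still sees the team counter equal to its
  // expectation; its CAS publishes the new count to everyone else.
  return team_construct.compare_exchange_strong(expected, my_construct_count,
                                                std::memory_order_acquire);
}
#endif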
760 
761 void __kmp_exit_single(int gtid) {
762 #if USE_ITT_BUILD
763  __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765  if (__kmp_env_consistency_check)
766  __kmp_pop_workshare(gtid, ct_psingle, NULL);
767 }
768 
769 /* determine if we can go parallel or must use a serialized parallel region and
770  * how many threads we can use
771  * set_nthreads is the number of threads requested for the team
772  * returns 1 if we should serialize or only use one thread,
773  * otherwise the number of threads to use
774  * The forkjoin lock is held by the caller. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776  int master_tid, int set_nthreads,
777  int enter_teams) {
778  int capacity;
779  int new_nthreads;
780  KMP_DEBUG_ASSERT(__kmp_init_serial);
781  KMP_DEBUG_ASSERT(root && parent_team);
782  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783 
784  // If dyn-var is set, dynamically adjust the number of desired threads,
785  // according to the method specified by dynamic_mode.
786  new_nthreads = set_nthreads;
787  if (!get__dynamic_2(parent_team, master_tid)) {
788  ;
789  }
790 #ifdef USE_LOAD_BALANCE
791  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793  if (new_nthreads == 1) {
794  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795  "reservation to 1 thread\n",
796  master_tid));
797  return 1;
798  }
799  if (new_nthreads < set_nthreads) {
800  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801  "reservation to %d threads\n",
802  master_tid, new_nthreads));
803  }
804  }
805 #endif /* USE_LOAD_BALANCE */
806  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807  new_nthreads = __kmp_avail_proc - __kmp_nth +
808  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809  if (new_nthreads <= 1) {
810  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811  "reservation to 1 thread\n",
812  master_tid));
813  return 1;
814  }
815  if (new_nthreads < set_nthreads) {
816  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817  "reservation to %d threads\n",
818  master_tid, new_nthreads));
819  } else {
820  new_nthreads = set_nthreads;
821  }
822  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823  if (set_nthreads > 2) {
824  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825  new_nthreads = (new_nthreads % set_nthreads) + 1;
826  if (new_nthreads == 1) {
827  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828  "reservation to 1 thread\n",
829  master_tid));
830  return 1;
831  }
832  if (new_nthreads < set_nthreads) {
833  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834  "reservation to %d threads\n",
835  master_tid, new_nthreads));
836  }
837  }
838  } else {
839  KMP_ASSERT(0);
840  }
841 
842  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843  if (__kmp_nth + new_nthreads -
844  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845  __kmp_max_nth) {
846  int tl_nthreads = __kmp_max_nth - __kmp_nth +
847  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848  if (tl_nthreads <= 0) {
849  tl_nthreads = 1;
850  }
851 
852  // If dyn-var is false, emit a 1-time warning.
853  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854  __kmp_reserve_warn = 1;
855  __kmp_msg(kmp_ms_warning,
856  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858  }
859  if (tl_nthreads == 1) {
860  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861  "reduced reservation to 1 thread\n",
862  master_tid));
863  return 1;
864  }
865  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866  "reservation to %d threads\n",
867  master_tid, tl_nthreads));
868  new_nthreads = tl_nthreads;
869  }
870 
871  // Respect OMP_THREAD_LIMIT
872  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874  if (cg_nthreads + new_nthreads -
875  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876  max_cg_threads) {
877  int tl_nthreads = max_cg_threads - cg_nthreads +
878  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879  if (tl_nthreads <= 0) {
880  tl_nthreads = 1;
881  }
882 
883  // If dyn-var is false, emit a 1-time warning.
884  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885  __kmp_reserve_warn = 1;
886  __kmp_msg(kmp_ms_warning,
887  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889  }
890  if (tl_nthreads == 1) {
891  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892  "reduced reservation to 1 thread\n",
893  master_tid));
894  return 1;
895  }
896  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897  "reservation to %d threads\n",
898  master_tid, tl_nthreads));
899  new_nthreads = tl_nthreads;
900  }
901 
902  // Check if the threads array is large enough, or needs expanding.
903  // See comment in __kmp_register_root() about the adjustment if
904  // __kmp_threads[0] == NULL.
905  capacity = __kmp_threads_capacity;
906  if (TCR_PTR(__kmp_threads[0]) == NULL) {
907  --capacity;
908  }
909  if (__kmp_nth + new_nthreads -
910  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911  capacity) {
912  // Expand the threads array.
913  int slotsRequired = __kmp_nth + new_nthreads -
914  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915  capacity;
916  int slotsAdded = __kmp_expand_threads(slotsRequired);
917  if (slotsAdded < slotsRequired) {
918  // The threads array was not expanded enough.
919  new_nthreads -= (slotsRequired - slotsAdded);
920  KMP_ASSERT(new_nthreads >= 1);
921 
922  // If dyn-var is false, emit a 1-time warning.
923  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924  __kmp_reserve_warn = 1;
925  if (__kmp_tp_cached) {
926  __kmp_msg(kmp_ms_warning,
927  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930  } else {
931  __kmp_msg(kmp_ms_warning,
932  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934  }
935  }
936  }
937  }
938 
939 #ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941  KC_TRACE(10,
942  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943  "dead roots and rechecking; requested %d threads\n",
944  __kmp_get_gtid(), set_nthreads));
945  } else {
946  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947  " %d threads\n",
948  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950 #endif // KMP_DEBUG
951  return new_nthreads;
952 }
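
/* Illustrative sketch (not part of the original source; excluded from the
   build): each of the limit checks above follows the same shape: the new team
   may use the limit minus the threads already counted, plus what the current
   (hot) team already contributes, clamped to at least 1 so the region can
   still run serialized. A standalone version of that clamp, with hypothetical
   inputs: */
#if 0
static int cap_new_team_size(int requested, int limit, int already_counted,
                             int current_team_contribution) {
  int allowed = limit - already_counted + current_team_contribution;
  if (allowed < 1)
    allowed = 1; // the master alone can always run the region serialized
  return requested < allowed ? requested : allowed;
}
#endif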
953 
954 /* Allocate threads from the thread pool and assign them to the new team. We are
955  assured that there are enough threads available, because we checked on that
956  earlier while holding the forkjoin lock. */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958  kmp_info_t *master_th, int master_gtid) {
959  int i;
960  int use_hot_team;
961 
962  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964  KMP_MB();
965 
966  /* first, let's setup the master thread */
967  master_th->th.th_info.ds.ds_tid = 0;
968  master_th->th.th_team = team;
969  master_th->th.th_team_nproc = team->t.t_nproc;
970  master_th->th.th_team_master = master_th;
971  master_th->th.th_team_serialized = FALSE;
972  master_th->th.th_dispatch = &team->t.t_dispatch[0];
973 
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
976  use_hot_team = 0;
977  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978  if (hot_teams) { // hot teams array is not allocated if
979  // KMP_HOT_TEAMS_MAX_LEVEL=0
980  int level = team->t.t_active_level - 1; // index in array of hot teams
981  if (master_th->th.th_teams_microtask) { // are we inside the teams?
982  if (master_th->th.th_teams_size.nteams > 1) {
983  ++level; // level was not increased in teams construct for
984  // team_of_masters
985  }
986  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987  master_th->th.th_teams_level == team->t.t_level) {
988  ++level; // level was not increased in teams construct for
989  // team_of_workers before the parallel
990  } // team->t.t_level will be increased inside parallel
991  }
992  if (level < __kmp_hot_teams_max_level) {
993  if (hot_teams[level].hot_team) {
994  // hot team has already been allocated for given level
995  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996  use_hot_team = 1; // the team is ready to use
997  } else {
998  use_hot_team = 0; // AC: threads are not allocated yet
999  hot_teams[level].hot_team = team; // remember new hot team
1000  hot_teams[level].hot_team_nth = team->t.t_nproc;
1001  }
1002  } else {
1003  use_hot_team = 0;
1004  }
1005  }
1006 #else
1007  use_hot_team = team == root->r.r_hot_team;
1008 #endif
1009  if (!use_hot_team) {
1010 
1011  /* install the master thread */
1012  team->t.t_threads[0] = master_th;
1013  __kmp_initialize_info(master_th, team, 0, master_gtid);
1014 
1015  /* now, install the worker threads */
1016  for (i = 1; i < team->t.t_nproc; i++) {
1017 
1018  /* fork or reallocate a new thread and install it in team */
1019  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020  team->t.t_threads[i] = thr;
1021  KMP_DEBUG_ASSERT(thr);
1022  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023  /* align team and thread arrived states */
1024  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029  team->t.t_bar[bs_plain_barrier].b_arrived));
1030  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031  thr->th.th_teams_level = master_th->th.th_teams_level;
1032  thr->th.th_teams_size = master_th->th.th_teams_size;
1033  { // Initialize threads' barrier data.
1034  int b;
1035  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036  for (b = 0; b < bs_last_barrier; ++b) {
1037  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039 #if USE_DEBUGGER
1040  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041 #endif
1042  }
1043  }
1044  }
1045 
1046 #if KMP_AFFINITY_SUPPORTED
1047  __kmp_partition_places(team);
1048 #endif
1049  }
1050 
1051  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052  for (i = 0; i < team->t.t_nproc; i++) {
1053  kmp_info_t *thr = team->t.t_threads[i];
1054  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055  thr->th.th_prev_level != team->t.t_level) {
1056  team->t.t_display_affinity = 1;
1057  break;
1058  }
1059  }
1060  }
1061 
1062  KMP_MB();
1063 }
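
/* Illustrative sketch (not part of the original source; excluded from the
   build): when a worker is installed above, its per-barrier arrived counter
   is seeded from the team's current value so the new thread is in the same
   barrier generation as the rest of the team. A standalone model of that
   seeding, with toy types standing in for kmp_balign_t/kmp_balign_team_t: */
#if 0
#include <cstdint>

struct toy_barrier_pair {
  std::uint64_t team_arrived;   // models team->t.t_bar[b].b_arrived
  std::uint64_t worker_arrived; // models thr->th.th_bar[b].bb.b_arrived
};

static void seed_worker_barriers(toy_barrier_pair *bars, int nbarriers) {
  for (int b = 0; b < nbarriers; ++b)
    bars[b].worker_arrived = bars[b].team_arrived; // align generations
}
#endif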
1064 
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the team
1067 // We try to avoid unnecessary writes to the relevant cache line in the team
1068 // structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070  if (__kmp_inherit_fp_control) {
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073 
1074  // Get master values of FPU control flags (both X87 and vector)
1075  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076  __kmp_store_mxcsr(&mxcsr);
1077  mxcsr &= KMP_X86_MXCSR_MASK;
1078 
1079  // There is no point looking at t_fp_control_saved here.
1080  // If it is TRUE, we still have to update the values if they are different
1081  // from those we now have. If it is FALSE we didn't save anything yet, but
1082  // our objective is the same. We have to ensure that the values in the team
1083  // are the same as those we have.
1084  // So, this code achieves what we need whether or not t_fp_control_saved is
1085  // true. By checking whether the value needs updating we avoid unnecessary
1086  // writes that would put the cache-line into a written state, causing all
1087  // threads in the team to have to read it again.
1088  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090  // Although we don't use this value, other code in the runtime wants to know
1091  // whether it should restore them. So we must ensure it is correct.
1092  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093  } else {
1094  // Similarly here. Don't write to this cache-line in the team structure
1095  // unless we have to.
1096  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097  }
1098 }
1099 
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104  // Only reset the fp control regs if they have been changed in the team by
1105  // the parallel region that we are exiting.
1106  kmp_int16 x87_fpu_control_word;
1107  kmp_uint32 mxcsr;
1108  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109  __kmp_store_mxcsr(&mxcsr);
1110  mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113  __kmp_clear_x87_fpu_status_word();
1114  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115  }
1116 
1117  if (team->t.t_mxcsr != mxcsr) {
1118  __kmp_load_mxcsr(&team->t.t_mxcsr);
1119  }
1120  }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
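
/* Illustrative sketch (not part of the original source; excluded from the
   build): propagateFPControl above deliberately writes to the shared team
   structure only when a field would actually change (the KMP_CHECK_UPDATE
   idiom), so the cache line is not dirtied and then re-read by every thread
   at each fork. The pattern, reduced to a standalone template: */
#if 0
template <typename T> static void check_update(T &dst, const T &src) {
  // Compare first; skip the store when the value is already current so the
  // containing cache line can stay in a shared (unmodified) state.
  if (dst != src)
    dst = src;
}
#endif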
1126 
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128  int realloc); // forward declaration
1129 
1130 /* Run a parallel region that has been serialized, so runs only in a team of the
1131  single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133  kmp_info_t *this_thr;
1134  kmp_team_t *serial_team;
1135 
1136  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137 
1138  /* Skip all this code for autopar serialized loops since it results in
1139  unacceptable overhead */
1140  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141  return;
1142 
1143  if (!TCR_4(__kmp_init_parallel))
1144  __kmp_parallel_initialize();
1145  __kmp_resume_if_soft_paused();
1146 
1147  this_thr = __kmp_threads[global_tid];
1148  serial_team = this_thr->th.th_serial_team;
1149 
1150  /* utilize the serialized team held by this thread */
1151  KMP_DEBUG_ASSERT(serial_team);
1152  KMP_MB();
1153 
1154  if (__kmp_tasking_mode != tskm_immediate_exec) {
1155  KMP_DEBUG_ASSERT(
1156  this_thr->th.th_task_team ==
1157  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159  NULL);
1160  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161  "team %p, new task_team = NULL\n",
1162  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163  this_thr->th.th_task_team = NULL;
1164  }
1165 
1166  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168  proc_bind = proc_bind_false;
1169  } else if (proc_bind == proc_bind_default) {
1170  // No proc_bind clause was specified, so use the current value
1171  // of proc-bind-var for this parallel region.
1172  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173  }
1174  // Reset for next parallel region
1175  this_thr->th.th_set_proc_bind = proc_bind_default;
1176 
1177 #if OMPT_SUPPORT
1178  ompt_data_t ompt_parallel_data = ompt_data_none;
1179  ompt_data_t *implicit_task_data;
1180  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181  if (ompt_enabled.enabled &&
1182  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183 
1184  ompt_task_info_t *parent_task_info;
1185  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186 
1187  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188  if (ompt_enabled.ompt_callback_parallel_begin) {
1189  int team_size = 1;
1190 
1191  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192  &(parent_task_info->task_data), &(parent_task_info->frame),
1193  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1194  codeptr);
1195  }
1196  }
1197 #endif // OMPT_SUPPORT
1198 
1199  if (this_thr->th.th_team != serial_team) {
1200  // Nested level will be an index in the nested nthreads array
1201  int level = this_thr->th.th_team->t.t_level;
1202 
1203  if (serial_team->t.t_serialized) {
1204  /* this serial team was already used
1205  TODO: increase performance by making these locks more specific */
1206  kmp_team_t *new_team;
1207 
1208  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209 
1210  new_team =
1211  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213  ompt_parallel_data,
1214 #endif
1215  proc_bind, &this_thr->th.th_current_task->td_icvs,
1216  0 USE_NESTED_HOT_ARG(NULL));
1217  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218  KMP_ASSERT(new_team);
1219 
1220  /* setup new serialized team and install it */
1221  new_team->t.t_threads[0] = this_thr;
1222  new_team->t.t_parent = this_thr->th.th_team;
1223  serial_team = new_team;
1224  this_thr->th.th_serial_team = serial_team;
1225 
1226  KF_TRACE(
1227  10,
1228  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229  global_tid, serial_team));
1230 
1231  /* TODO the above breaks the requirement that if we run out of resources,
1232  then we can still guarantee that serialized teams are ok, since we may
1233  need to allocate a new one */
1234  } else {
1235  KF_TRACE(
1236  10,
1237  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238  global_tid, serial_team));
1239  }
1240 
1241  /* we have to initialize this serial team */
1242  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245  serial_team->t.t_ident = loc;
1246  serial_team->t.t_serialized = 1;
1247  serial_team->t.t_nproc = 1;
1248  serial_team->t.t_parent = this_thr->th.th_team;
1249  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250  this_thr->th.th_team = serial_team;
1251  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252 
1253  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1254  this_thr->th.th_current_task));
1255  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256  this_thr->th.th_current_task->td_flags.executing = 0;
1257 
1258  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259 
1260  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261  implicit task for each serialized task represented by
1262  team->t.t_serialized? */
1263  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264  &this_thr->th.th_current_task->td_parent->td_icvs);
1265 
1266  // Thread value exists in the nested nthreads array for the next nested
1267  // level
1268  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269  this_thr->th.th_current_task->td_icvs.nproc =
1270  __kmp_nested_nth.nth[level + 1];
1271  }
1272 
1273  if (__kmp_nested_proc_bind.used &&
1274  (level + 1 < __kmp_nested_proc_bind.used)) {
1275  this_thr->th.th_current_task->td_icvs.proc_bind =
1276  __kmp_nested_proc_bind.bind_types[level + 1];
1277  }
1278 
1279 #if USE_DEBUGGER
1280  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282  this_thr->th.th_info.ds.ds_tid = 0;
1283 
1284  /* set thread cache values */
1285  this_thr->th.th_team_nproc = 1;
1286  this_thr->th.th_team_master = this_thr;
1287  this_thr->th.th_team_serialized = 1;
1288 
1289  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292 
1293  propagateFPControl(serial_team);
1294 
1295  /* check if we need to allocate dispatch buffers stack */
1296  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298  serial_team->t.t_dispatch->th_disp_buffer =
1299  (dispatch_private_info_t *)__kmp_allocate(
1300  sizeof(dispatch_private_info_t));
1301  }
1302  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303 
1304  KMP_MB();
1305 
1306  } else {
1307  /* this serialized team is already being used,
1308  * that's fine, just add another nested level */
1309  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312  ++serial_team->t.t_serialized;
1313  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314 
1315  // Nested level will be an index in the nested nthreads array
1316  int level = this_thr->th.th_team->t.t_level;
1317  // Thread value exists in the nested nthreads array for the next nested
1318  // level
1319  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320  this_thr->th.th_current_task->td_icvs.nproc =
1321  __kmp_nested_nth.nth[level + 1];
1322  }
1323  serial_team->t.t_level++;
1324  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325  "of serial team %p to %d\n",
1326  global_tid, serial_team, serial_team->t.t_level));
1327 
1328  /* allocate/push dispatch buffers stack */
1329  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330  {
1331  dispatch_private_info_t *disp_buffer =
1332  (dispatch_private_info_t *)__kmp_allocate(
1333  sizeof(dispatch_private_info_t));
1334  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336  }
1337  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338 
1339  KMP_MB();
1340  }
1341  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342 
1343  // Perform the display affinity functionality for
1344  // serialized parallel regions
1345  if (__kmp_display_affinity) {
1346  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347  this_thr->th.th_prev_num_threads != 1) {
1348  // NULL means use the affinity-format-var ICV
1349  __kmp_aux_display_affinity(global_tid, NULL);
1350  this_thr->th.th_prev_level = serial_team->t.t_level;
1351  this_thr->th.th_prev_num_threads = 1;
1352  }
1353  }
1354 
1355  if (__kmp_env_consistency_check)
1356  __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358  serial_team->t.ompt_team_info.master_return_address = codeptr;
1359  if (ompt_enabled.enabled &&
1360  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362 
1363  ompt_lw_taskteam_t lw_taskteam;
1364  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365  &ompt_parallel_data, codeptr);
1366 
1367  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368  // don't use lw_taskteam after linking. content was swapped
1369 
1370  /* OMPT implicit task begin */
1371  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372  if (ompt_enabled.ompt_callback_implicit_task) {
1373  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376  OMPT_CUR_TASK_INFO(this_thr)
1377  ->thread_num = __kmp_tid_from_gtid(global_tid);
1378  }
1379 
1380  /* OMPT state */
1381  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383  }
1384 #endif
1385 }
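
/* Illustrative sketch (not part of the original source; excluded from the
   build): __kmp_serialized_parallel above takes the nproc ICV for the next
   nesting level from the OMP_NUM_THREADS list only when that level was
   actually specified (level + 1 < used). A standalone version of that lookup,
   with a toy list type standing in for __kmp_nested_nth: */
#if 0
struct toy_nested_nth {
  int *nth; // per-level thread counts parsed from OMP_NUM_THREADS
  int used; // number of levels actually specified
};

static int next_level_nproc(const toy_nested_nth &list, int level,
                            int current_nproc) {
  // Fall back to the current value when the next level was not specified.
  if (list.used && (level + 1 < list.used))
    return list.nth[level + 1];
  return current_nproc;
}
#endif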
1386 
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390  enum fork_context_e call_context, // Intel, GNU, ...
1391  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394  va_list *ap
1395 #else
1396  va_list ap
1397 #endif
1398  ) {
1399  void **argv;
1400  int i;
1401  int master_tid;
1402  int master_this_cons;
1403  kmp_team_t *team;
1404  kmp_team_t *parent_team;
1405  kmp_info_t *master_th;
1406  kmp_root_t *root;
1407  int nthreads;
1408  int master_active;
1409  int master_set_numthreads;
1410  int level;
1411  int active_level;
1412  int teams_level;
1413 #if KMP_NESTED_HOT_TEAMS
1414  kmp_hot_team_ptr_t **p_hot_teams;
1415 #endif
1416  { // KMP_TIME_BLOCK
1417  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419 
1420  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422  /* Some systems prefer the stack for the root thread(s) to start with */
1423  /* some gap from the parent stack to prevent false sharing. */
1424  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425  /* These 2 lines below are so this does not get optimized out */
1426  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427  __kmp_stkpadding += (short)((kmp_int64)dummy);
1428  }
1429 
1430  /* initialize if needed */
1431  KMP_DEBUG_ASSERT(
1432  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433  if (!TCR_4(__kmp_init_parallel))
1434  __kmp_parallel_initialize();
1435  __kmp_resume_if_soft_paused();
1436 
1437  /* setup current data */
1438  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439  // shutdown
1440  parent_team = master_th->th.th_team;
1441  master_tid = master_th->th.th_info.ds.ds_tid;
1442  master_this_cons = master_th->th.th_local.this_construct;
1443  root = master_th->th.th_root;
1444  master_active = root->r.r_active;
1445  master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448  ompt_data_t ompt_parallel_data = ompt_data_none;
1449  ompt_data_t *parent_task_data;
1450  ompt_frame_t *ompt_frame;
1451  ompt_data_t *implicit_task_data;
1452  void *return_address = NULL;
1453 
1454  if (ompt_enabled.enabled) {
1455  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456  NULL, NULL);
1457  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458  }
1459 #endif
1460 
1461  // Nested level will be an index in the nested nthreads array
1462  level = parent_team->t.t_level;
1463  // used to launch non-serial teams even if nested is not allowed
1464  active_level = parent_team->t.t_active_level;
1465  // needed to check nesting inside the teams
1466  teams_level = master_th->th.th_teams_level;
1467 #if KMP_NESTED_HOT_TEAMS
1468  p_hot_teams = &master_th->th.th_hot_teams;
1469  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473  // it is either actual or not needed (when active_level > 0)
1474  (*p_hot_teams)[0].hot_team_nth = 1;
1475  }
1476 #endif
1477 
1478 #if OMPT_SUPPORT
1479  if (ompt_enabled.enabled) {
1480  if (ompt_enabled.ompt_callback_parallel_begin) {
1481  int team_size = master_set_numthreads
1482  ? master_set_numthreads
1483  : get__nproc_2(parent_team, master_tid);
1484  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1485  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1486  OMPT_INVOKER(call_context), return_address);
1487  }
1488  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489  }
1490 #endif
1491 
1492  master_th->th.th_ident = loc;
1493 
1494  if (master_th->th.th_teams_microtask && ap &&
1495  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1496  // AC: This is start of parallel that is nested inside teams construct.
1497  // The team is actual (hot), all workers are ready at the fork barrier.
1498  // No lock needed to initialize the team a bit, then free workers.
1499  parent_team->t.t_ident = loc;
1500  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1501  parent_team->t.t_argc = argc;
1502  argv = (void **)parent_team->t.t_argv;
1503  for (i = argc - 1; i >= 0; --i)
1504 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1505 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1506  *argv++ = va_arg(*ap, void *);
1507 #else
1508  *argv++ = va_arg(ap, void *);
1509 #endif
1510  // Increment our nested depth levels, but do not increase the serialization
1511  if (parent_team == master_th->th.th_serial_team) {
1512  // AC: we are in serialized parallel
1513  __kmpc_serialized_parallel(loc, gtid);
1514  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1515  // AC: need this so that enquiry functions work
1516  // correctly; will restore at join time
1517  parent_team->t.t_serialized--;
1518 #if OMPT_SUPPORT
1519  void *dummy;
1520  void **exit_runtime_p;
1521 
1522  ompt_lw_taskteam_t lw_taskteam;
1523 
1524  if (ompt_enabled.enabled) {
1525  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1526  &ompt_parallel_data, return_address);
1527  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1528 
1529  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1530  // don't use lw_taskteam after linking. content was swapped
1531 
1532  /* OMPT implicit task begin */
1533  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1534  if (ompt_enabled.ompt_callback_implicit_task) {
1535  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1536  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1537  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1538  OMPT_CUR_TASK_INFO(master_th)
1539  ->thread_num = __kmp_tid_from_gtid(gtid);
1540  }
1541 
1542  /* OMPT state */
1543  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1544  } else {
1545  exit_runtime_p = &dummy;
1546  }
1547 #endif
1548 
1549  {
1550  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1551  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1552  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1553 #if OMPT_SUPPORT
1554  ,
1555  exit_runtime_p
1556 #endif
1557  );
1558  }
1559 
1560 #if OMPT_SUPPORT
1561  *exit_runtime_p = NULL;
1562  if (ompt_enabled.enabled) {
1563  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1564  if (ompt_enabled.ompt_callback_implicit_task) {
1565  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1566  ompt_scope_end, NULL, implicit_task_data, 1,
1567  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1568  }
1569  __ompt_lw_taskteam_unlink(master_th);
1570 
1571  if (ompt_enabled.ompt_callback_parallel_end) {
1572  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1573  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1574  OMPT_INVOKER(call_context), return_address);
1575  }
1576  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1577  }
1578 #endif
1579  return TRUE;
1580  }
1581 
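  // For a non-serialized parallel nested directly inside teams, reuse the
  // existing (hot) parent team: update its pkfn/invoke fields and nesting
  // levels, release the workers already waiting at the fork barrier via
  // __kmp_internal_fork(), and let the master invoke the microtask itself.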
1582  parent_team->t.t_pkfn = microtask;
1583  parent_team->t.t_invoke = invoker;
1584  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1585  parent_team->t.t_active_level++;
1586  parent_team->t.t_level++;
1587  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1588 
1589  /* Change number of threads in the team if requested */
1590  if (master_set_numthreads) { // The parallel has num_threads clause
1591  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1592  // AC: can only reduce the number of threads dynamically, can't increase
1593  kmp_info_t **other_threads = parent_team->t.t_threads;
1594  parent_team->t.t_nproc = master_set_numthreads;
1595  for (i = 0; i < master_set_numthreads; ++i) {
1596  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1597  }
1598  // Keep extra threads hot in the team for possible next parallels
1599  }
1600  master_th->th.th_set_nproc = 0;
1601  }
1602 
1603 #if USE_DEBUGGER
1604  if (__kmp_debugging) { // Let debugger override number of threads.
1605  int nth = __kmp_omp_num_threads(loc);
1606  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1607  master_set_numthreads = nth;
1608  }
1609  }
1610 #endif
1611 
1612  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1613  "master_th=%p, gtid=%d\n",
1614  root, parent_team, master_th, gtid));
1615  __kmp_internal_fork(loc, gtid, parent_team);
1616  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1617  "master_th=%p, gtid=%d\n",
1618  root, parent_team, master_th, gtid));
1619 
1620  /* Invoke microtask for MASTER thread */
1621  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1622  parent_team->t.t_id, parent_team->t.t_pkfn));
1623 
1624  if (!parent_team->t.t_invoke(gtid)) {
1625  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1626  }
1627  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1628  parent_team->t.t_id, parent_team->t.t_pkfn));
1629  KMP_MB(); /* Flush all pending memory write invalidates. */
1630 
1631  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1632 
1633  return TRUE;
1634  } // Parallel closely nested in teams construct
1635 
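  // General code path: determine how many threads the new region gets; if
  // only one, serialize it, otherwise allocate (or reuse) a team, fork the
  // worker threads, and invoke the microtask on the master.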
1636 #if KMP_DEBUG
1637  if (__kmp_tasking_mode != tskm_immediate_exec) {
1638  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1639  parent_team->t.t_task_team[master_th->th.th_task_state]);
1640  }
1641 #endif
1642 
1643  if (parent_team->t.t_active_level >=
1644  master_th->th.th_current_task->td_icvs.max_active_levels) {
1645  nthreads = 1;
1646  } else {
1647  int enter_teams = ((ap == NULL && active_level == 0) ||
1648  (ap && teams_level > 0 && teams_level == level));
1649  nthreads =
1650  master_set_numthreads
1651  ? master_set_numthreads
1652  : get__nproc_2(
1653  parent_team,
1654  master_tid); // TODO: get nproc directly from current task
1655 
1656  // Check whether we need to take the forkjoin lock (no need for a serialized
1657  // parallel outside of a teams construct). This code was moved here from
1658  // __kmp_reserve_threads() to speed up nested serialized parallels.
1659  if (nthreads > 1) {
1660  if ((get__max_active_levels(master_th) == 1 &&
1661  (root->r.r_in_parallel && !enter_teams)) ||
1662  (__kmp_library == library_serial)) {
1663  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1664  " threads\n",
1665  gtid, nthreads));
1666  nthreads = 1;
1667  }
1668  }
1669  if (nthreads > 1) {
1670  /* determine how many new threads we can use */
1671  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1672  /* AC: If we execute teams from a parallel region (on host), then the teams
1673  should be created, but each can have only 1 thread if nesting is
1674  disabled. If teams is called from a serial region, then the teams and their
1675  threads should be created regardless of the nesting setting. */
1676  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1677  nthreads, enter_teams);
1678  if (nthreads == 1) {
1679  // Free lock for single thread execution here; for multi-thread
1680  // execution it will be freed later after team of threads created
1681  // and initialized
1682  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1683  }
1684  }
1685  }
1686  KMP_DEBUG_ASSERT(nthreads > 0);
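  /* Illustrative example (not part of the sources): with the default
     max-active-levels of 1, the checks above typically serialize an inner
     region, e.g.

       #pragma omp parallel num_threads(4)   // outer region forks 4 threads
       {
         #pragma omp parallel num_threads(2) // inner region: nthreads == 1
         { ... }
       }

     so each outer thread runs the inner region serialized unless nesting is
     enabled (e.g. via omp_set_max_active_levels()). */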
1687 
1688  // If we temporarily changed the set number of threads then restore it now
1689  master_th->th.th_set_nproc = 0;
1690 
1691  /* create a serialized parallel region? */
1692  if (nthreads == 1) {
1693 /* josh todo: hypothetical question: what do we do for OS X*? */
1694 #if KMP_OS_LINUX && \
1695  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1696  void *args[argc];
1697 #else
1698  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1700  KMP_ARCH_AARCH64) */
1701 
1702  KA_TRACE(20,
1703  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1704 
1705  __kmpc_serialized_parallel(loc, gtid);
1706 
1707  if (call_context == fork_context_intel) {
1708  /* TODO this sucks, use the compiler itself to pass args! :) */
1709  master_th->th.th_serial_team->t.t_ident = loc;
1710  if (!ap) {
1711  // revert change made in __kmpc_serialized_parallel()
1712  master_th->th.th_serial_team->t.t_level--;
1713 // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716  void *dummy;
1717  void **exit_runtime_p;
1718  ompt_task_info_t *task_info;
1719 
1720  ompt_lw_taskteam_t lw_taskteam;
1721 
1722  if (ompt_enabled.enabled) {
1723  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724  &ompt_parallel_data, return_address);
1725 
1726  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727  // don't use lw_taskteam after linking; its content was swapped
1728 
1729  task_info = OMPT_CUR_TASK_INFO(master_th);
1730  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1731  if (ompt_enabled.ompt_callback_implicit_task) {
1732  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1733  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1734  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1735  OMPT_CUR_TASK_INFO(master_th)
1736  ->thread_num = __kmp_tid_from_gtid(gtid);
1737  }
1738 
1739  /* OMPT state */
1740  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1741  } else {
1742  exit_runtime_p = &dummy;
1743  }
1744 #endif
1745 
1746  {
1747  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1748  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1749  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1750  parent_team->t.t_argv
1751 #if OMPT_SUPPORT
1752  ,
1753  exit_runtime_p
1754 #endif
1755  );
1756  }
1757 
1758 #if OMPT_SUPPORT
1759  if (ompt_enabled.enabled) {
1760  exit_runtime_p = NULL;
1761  if (ompt_enabled.ompt_callback_implicit_task) {
1762  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763  ompt_scope_end, NULL, &(task_info->task_data), 1,
1764  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1765  }
1766 
1767  __ompt_lw_taskteam_unlink(master_th);
1768  if (ompt_enabled.ompt_callback_parallel_end) {
1769  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1770  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1771  OMPT_INVOKER(call_context), return_address);
1772  }
1773  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774  }
1775 #endif
1776  } else if (microtask == (microtask_t)__kmp_teams_master) {
1777  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1778  master_th->th.th_serial_team);
1779  team = master_th->th.th_team;
1780  // team->t.t_pkfn = microtask;
1781  team->t.t_invoke = invoker;
1782  __kmp_alloc_argv_entries(argc, team, TRUE);
1783  team->t.t_argc = argc;
1784  argv = (void **)team->t.t_argv;
1785  if (ap) {
1786  for (i = argc - 1; i >= 0; --i)
1787 // TODO: revert workaround for Intel(R) 64 tracker #96
1788 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1789  *argv++ = va_arg(*ap, void *);
1790 #else
1791  *argv++ = va_arg(ap, void *);
1792 #endif
1793  } else {
1794  for (i = 0; i < argc; ++i)
1795  // Get args from parent team for teams construct
1796  argv[i] = parent_team->t.t_argv[i];
1797  }
1798  // AC: revert change made in __kmpc_serialized_parallel()
1799  // because initial code in teams should have level=0
1800  team->t.t_level--;
1801  // AC: call special invoker for outer "parallel" of teams construct
1802  invoker(gtid);
1803  } else {
1804  argv = args;
1805  for (i = argc - 1; i >= 0; --i)
1806 // TODO: revert workaround for Intel(R) 64 tracker #96
1807 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1808  *argv++ = va_arg(*ap, void *);
1809 #else
1810  *argv++ = va_arg(ap, void *);
1811 #endif
1812  KMP_MB();
1813 
1814 #if OMPT_SUPPORT
1815  void *dummy;
1816  void **exit_runtime_p;
1817  ompt_task_info_t *task_info;
1818 
1819  ompt_lw_taskteam_t lw_taskteam;
1820 
1821  if (ompt_enabled.enabled) {
1822  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1823  &ompt_parallel_data, return_address);
1824  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1825  // don't use lw_taskteam after linking; its content was swapped
1826  task_info = OMPT_CUR_TASK_INFO(master_th);
1827  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1828 
1829  /* OMPT implicit task begin */
1830  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1831  if (ompt_enabled.ompt_callback_implicit_task) {
1832  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1833  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1834  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1835  OMPT_CUR_TASK_INFO(master_th)
1836  ->thread_num = __kmp_tid_from_gtid(gtid);
1837  }
1838 
1839  /* OMPT state */
1840  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1841  } else {
1842  exit_runtime_p = &dummy;
1843  }
1844 #endif
1845 
1846  {
1847  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1848  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1849  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1850 #if OMPT_SUPPORT
1851  ,
1852  exit_runtime_p
1853 #endif
1854  );
1855  }
1856 
1857 #if OMPT_SUPPORT
1858  if (ompt_enabled.enabled) {
1859  *exit_runtime_p = NULL;
1860  if (ompt_enabled.ompt_callback_implicit_task) {
1861  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1862  ompt_scope_end, NULL, &(task_info->task_data), 1,
1863  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1864  }
1865 
1866  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1867  __ompt_lw_taskteam_unlink(master_th);
1868  if (ompt_enabled.ompt_callback_parallel_end) {
1869  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1870  &ompt_parallel_data, parent_task_data,
1871  OMPT_INVOKER(call_context), return_address);
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876  }
1877  } else if (call_context == fork_context_gnu) {
1878 #if OMPT_SUPPORT
1879  ompt_lw_taskteam_t lwt;
1880  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1881  return_address);
1882 
1883  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1884  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1885 // don't use lw_taskteam after linking; its content was swapped
1886 #endif
1887 
1888  // we were called from GNU native code
1889  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890  return FALSE;
1891  } else {
1892  KMP_ASSERT2(call_context < fork_context_last,
1893  "__kmp_fork_call: unknown fork_context parameter");
1894  }
1895 
1896  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1897  KMP_MB();
1898  return FALSE;
1899  } // if (nthreads == 1)
1900 
1901  // GEH: only modify the executing flag in the case when not serialized;
1902  // the serialized case is handled in __kmpc_serialized_parallel
1903  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1904  "curtask=%p, curtask_max_aclevel=%d\n",
1905  parent_team->t.t_active_level, master_th,
1906  master_th->th.th_current_task,
1907  master_th->th.th_current_task->td_icvs.max_active_levels));
1908  // TODO: GEH - cannot do this assertion because root thread not set up as
1909  // executing
1910  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1911  master_th->th.th_current_task->td_flags.executing = 0;
1912 
1913  if (!master_th->th.th_teams_microtask || level > teams_level) {
1914  /* Increment our nested depth level */
1915  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1916  }
1917 
1918  // See if we need to make a copy of the ICVs.
1919  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1920  if ((level + 1 < __kmp_nested_nth.used) &&
1921  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1922  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1923  } else {
1924  nthreads_icv = 0; // don't update
1925  }
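  /* Illustrative example (assumes the usual settings machinery, not shown in
     this file): __kmp_nested_nth is normally populated from a list-valued
     OMP_NUM_THREADS, e.g.

       OMP_NUM_THREADS=8,2

     gives nth[0] == 8 and nth[1] == 2, so a region forked at nesting level 0
     seeds nproc = 2 into the child team's ICVs through nthreads_icv here. */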
1926 
1927  // Figure out the proc_bind_policy for the new team.
1928  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1929  kmp_proc_bind_t proc_bind_icv =
1930  proc_bind_default; // proc_bind_default means don't update
1931  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1932  proc_bind = proc_bind_false;
1933  } else {
1934  if (proc_bind == proc_bind_default) {
1935  // No proc_bind clause specified; use current proc-bind-var for this
1936  // parallel region
1937  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1938  }
1939  /* else: The proc_bind policy was specified explicitly on parallel clause.
1940  This overrides proc-bind-var for this parallel region, but does not
1941  change proc-bind-var. */
1942  // Figure the value of proc-bind-var for the child threads.
1943  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1944  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1945  master_th->th.th_current_task->td_icvs.proc_bind)) {
1946  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1947  }
1948  }
1949 
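  /* Illustrative example (assumes the usual settings machinery): with

       OMP_PROC_BIND=spread,close

     __kmp_nested_proc_bind.bind_types holds {spread, close}; the outer region
     binds with 'spread' while proc_bind_icv seeds 'close' as proc-bind-var for
     the next nesting level, unless a proc_bind clause overrides it. */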
1950  // Reset for next parallel region
1951  master_th->th.th_set_proc_bind = proc_bind_default;
1952 
1953  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1954  kmp_internal_control_t new_icvs;
1955  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1956  new_icvs.next = NULL;
1957  if (nthreads_icv > 0) {
1958  new_icvs.nproc = nthreads_icv;
1959  }
1960  if (proc_bind_icv != proc_bind_default) {
1961  new_icvs.proc_bind = proc_bind_icv;
1962  }
1963 
1964  /* allocate a new parallel team */
1965  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1966  team = __kmp_allocate_team(root, nthreads, nthreads,
1967 #if OMPT_SUPPORT
1968  ompt_parallel_data,
1969 #endif
1970  proc_bind, &new_icvs,
1971  argc USE_NESTED_HOT_ARG(master_th));
1972  } else {
1973  /* allocate a new parallel team */
1974  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1975  team = __kmp_allocate_team(root, nthreads, nthreads,
1976 #if OMPT_SUPPORT
1977  ompt_parallel_data,
1978 #endif
1979  proc_bind,
1980  &master_th->th.th_current_task->td_icvs,
1981  argc USE_NESTED_HOT_ARG(master_th));
1982  }
1983  KF_TRACE(
1984  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1985 
1986  /* setup the new team */
1987  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1988  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1989  KMP_CHECK_UPDATE(team->t.t_ident, loc);
1990  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1991  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1992 #if OMPT_SUPPORT
1993  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1994  return_address);
1995 #endif
1996  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1997  // TODO: parent_team->t.t_level == INT_MAX ???
1998  if (!master_th->th.th_teams_microtask || level > teams_level) {
1999  int new_level = parent_team->t.t_level + 1;
2000  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2001  new_level = parent_team->t.t_active_level + 1;
2002  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2003  } else {
2004  // AC: Do not increase parallel level at start of the teams construct
2005  int new_level = parent_team->t.t_level;
2006  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2007  new_level = parent_team->t.t_active_level;
2008  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2009  }
2010  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2011  // set master's schedule as new run-time schedule
2012  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2013 
2014  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2015  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2016 
2017  // Update the floating point rounding in the team if required.
2018  propagateFPControl(team);
2019 
2020  if (__kmp_tasking_mode != tskm_immediate_exec) {
2021  // Set master's task team to team's task team. Unless this is hot team, it
2022  // should be NULL.
2023  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2024  parent_team->t.t_task_team[master_th->th.th_task_state]);
2025  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2026  "%p, new task_team %p / team %p\n",
2027  __kmp_gtid_from_thread(master_th),
2028  master_th->th.th_task_team, parent_team,
2029  team->t.t_task_team[master_th->th.th_task_state], team));
2030 
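    // Save the master's th_task_state on its memo stack so it can be restored
    // at join time; the stack is doubled below if the nesting depth exceeds
    // its current size.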
2031  if (active_level || master_th->th.th_task_team) {
2032  // Take a memo of master's task_state
2033  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2034  if (master_th->th.th_task_state_top >=
2035  master_th->th.th_task_state_stack_sz) { // increase size
2036  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2037  kmp_uint8 *old_stack, *new_stack;
2038  kmp_uint32 i;
2039  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2040  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2041  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2042  }
2043  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2044  ++i) { // zero-init rest of stack
2045  new_stack[i] = 0;
2046  }
2047  old_stack = master_th->th.th_task_state_memo_stack;
2048  master_th->th.th_task_state_memo_stack = new_stack;
2049  master_th->th.th_task_state_stack_sz = new_size;
2050  __kmp_free(old_stack);
2051  }
2052  // Store master's task_state on stack
2053  master_th->th
2054  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2055  master_th->th.th_task_state;
2056  master_th->th.th_task_state_top++;
2057 #if KMP_NESTED_HOT_TEAMS
2058  if (master_th->th.th_hot_teams &&
2059  active_level < __kmp_hot_teams_max_level &&
2060  team == master_th->th.th_hot_teams[active_level].hot_team) {
2061  // Restore master's nested state if nested hot team
2062  master_th->th.th_task_state =
2063  master_th->th
2064  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2065  } else {
2066 #endif
2067  master_th->th.th_task_state = 0;
2068 #if KMP_NESTED_HOT_TEAMS
2069  }
2070 #endif
2071  }
2072 #if !KMP_NESTED_HOT_TEAMS
2073  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2074  (team == root->r.r_hot_team));
2075 #endif
2076  }
2077 
2078  KA_TRACE(
2079  20,
2080  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2081  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2082  team->t.t_nproc));
2083  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2084  (team->t.t_master_tid == 0 &&
2085  (team->t.t_parent == root->r.r_root_team ||
2086  team->t.t_parent->t.t_serialized)));
2087  KMP_MB();
2088 
2089  /* now, setup the arguments */
2090  argv = (void **)team->t.t_argv;
2091  if (ap) {
2092  for (i = argc - 1; i >= 0; --i) {
2093 // TODO: revert workaround for Intel(R) 64 tracker #96
2094 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2095  void *new_argv = va_arg(*ap, void *);
2096 #else
2097  void *new_argv = va_arg(ap, void *);
2098 #endif
2099  KMP_CHECK_UPDATE(*argv, new_argv);
2100  argv++;
2101  }
2102  } else {
2103  for (i = 0; i < argc; ++i) {
2104  // Get args from parent team for teams construct
2105  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2106  }
2107  }
2108 
2109  /* now actually fork the threads */
2110  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2111  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2112  root->r.r_active = TRUE;
2113 
2114  __kmp_fork_team_threads(root, team, master_th, gtid);
2115  __kmp_setup_icv_copy(team, nthreads,
2116  &master_th->th.th_current_task->td_icvs, loc);
2117 
2118 #if OMPT_SUPPORT
2119  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2120 #endif
2121 
2122  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2123 
2124 #if USE_ITT_BUILD
2125  if (team->t.t_active_level == 1 // only report frames at level 1
2126  && !master_th->th.th_teams_microtask) { // not in teams construct
2127 #if USE_ITT_NOTIFY
2128  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2129  (__kmp_forkjoin_frames_mode == 3 ||
2130  __kmp_forkjoin_frames_mode == 1)) {
2131  kmp_uint64 tmp_time = 0;
2132  if (__itt_get_timestamp_ptr)
2133  tmp_time = __itt_get_timestamp();
2134  // Internal fork - report frame begin
2135  master_th->th.th_frame_time = tmp_time;
2136  if (__kmp_forkjoin_frames_mode == 3)
2137  team->t.t_region_time = tmp_time;
2138  } else
2139 // only one notification scheme (either "submit" or "forking/joined", not both)
2140 #endif /* USE_ITT_NOTIFY */
2141  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2142  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2143  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2144  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2145  }
2146  }
2147 #endif /* USE_ITT_BUILD */
2148 
2149  /* now go on and do the work */
2150  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2151  KMP_MB();
2152  KF_TRACE(10,
2153  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2154  root, team, master_th, gtid));
2155 
2156 #if USE_ITT_BUILD
2157  if (__itt_stack_caller_create_ptr) {
2158  team->t.t_stack_id =
2159  __kmp_itt_stack_caller_create(); // create new stack stitching id
2160  // before entering fork barrier
2161  }
2162 #endif /* USE_ITT_BUILD */
2163 
2164  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2165  // threads execute
2166  if (ap) {
2167  __kmp_internal_fork(loc, gtid, team);
2168  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2169  "master_th=%p, gtid=%d\n",
2170  root, team, master_th, gtid));
2171  }
2172 
2173  if (call_context == fork_context_gnu) {
2174  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2175  return TRUE;
2176  }
2177 
2178  /* Invoke microtask for MASTER thread */
2179  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2180  team->t.t_id, team->t.t_pkfn));
2181  } // END of timer KMP_fork_call block
2182 
2183 #if KMP_STATS_ENABLED
2184  // If beginning a teams construct, then change thread state
2185  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2186  if (!ap) {
2187  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2188  }
2189 #endif
2190 
2191  if (!team->t.t_invoke(gtid)) {
2192  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2193  }
2194 
2195 #if KMP_STATS_ENABLED
2196  // If was beginning of a teams construct, then reset thread state
2197  if (!ap) {
2198  KMP_SET_THREAD_STATE(previous_state);
2199  }
2200 #endif
2201 
2202  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2203  team->t.t_id, team->t.t_pkfn));
2204  KMP_MB(); /* Flush all pending memory write invalidates. */
2205 
2206  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2207 
2208 #if OMPT_SUPPORT
2209  if (ompt_enabled.enabled) {
2210  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2211  }
2212 #endif
2213 
2214  return TRUE;
2215 }
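
/* Illustrative sketch (simplified; the local names are hypothetical):
   compiler-generated code typically drives this fork/join pair through
   __kmpc_fork_call(), roughly

     static void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *shared) {
       // body of the parallel region
     }
     ...
     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &shared_var);

   which ends up in __kmp_fork_call() above, invokes the microtask on the
   master thread, and later reaches __kmp_join_call() below. */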
2216 
2217 #if OMPT_SUPPORT
2218 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2219  kmp_team_t *team) {
2220  // restore state outside the region
2221  thread->th.ompt_thread_info.state =
2222  ((team->t.t_serialized) ? ompt_state_work_serial
2223  : ompt_state_work_parallel);
2224 }
2225 
2226 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2227  kmp_team_t *team, ompt_data_t *parallel_data,
2228  fork_context_e fork_context, void *codeptr) {
2229  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2230  if (ompt_enabled.ompt_callback_parallel_end) {
2231  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2232  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2233  codeptr);
2234  }
2235 
2236  task_info->frame.enter_frame = ompt_data_none;
2237  __kmp_join_restore_state(thread, team);
2238 }
2239 #endif
2240 
2241 void __kmp_join_call(ident_t *loc, int gtid
2242 #if OMPT_SUPPORT
2243  ,
2244  enum fork_context_e fork_context
2245 #endif
2246  ,
2247  int exit_teams) {
2248  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2249  kmp_team_t *team;
2250  kmp_team_t *parent_team;
2251  kmp_info_t *master_th;
2252  kmp_root_t *root;
2253  int master_active;
2254 
2255  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2256 
2257  /* setup current data */
2258  master_th = __kmp_threads[gtid];
2259  root = master_th->th.th_root;
2260  team = master_th->th.th_team;
2261  parent_team = team->t.t_parent;
2262 
2263  master_th->th.th_ident = loc;
2264 
2265 #if OMPT_SUPPORT
2266  if (ompt_enabled.enabled) {
2267  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2268  }
2269 #endif
2270 
2271 #if KMP_DEBUG
2272  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2273  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2274  "th_task_team = %p\n",
2275  __kmp_gtid_from_thread(master_th), team,
2276  team->t.t_task_team[master_th->th.th_task_state],
2277  master_th->th.th_task_team));
2278  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2279  team->t.t_task_team[master_th->th.th_task_state]);
2280  }
2281 #endif
2282 
2283  if (team->t.t_serialized) {
2284  if (master_th->th.th_teams_microtask) {
2285  // We are in teams construct
2286  int level = team->t.t_level;
2287  int tlevel = master_th->th.th_teams_level;
2288  if (level == tlevel) {
2289  // AC: we haven't incremented it earlier at start of teams construct,
2290  // so do it here - at the end of teams construct
2291  team->t.t_level++;
2292  } else if (level == tlevel + 1) {
2293  // AC: we are exiting parallel inside teams, need to increment
2294  // serialization in order to restore it in the next call to
2295  // __kmpc_end_serialized_parallel
2296  team->t.t_serialized++;
2297  }
2298  }
2299  __kmpc_end_serialized_parallel(loc, gtid);
2300 
2301 #if OMPT_SUPPORT
2302  if (ompt_enabled.enabled) {
2303  __kmp_join_restore_state(master_th, parent_team);
2304  }
2305 #endif
2306 
2307  return;
2308  }
2309 
2310  master_active = team->t.t_master_active;
2311 
2312  if (!exit_teams) {
2313  // AC: No barrier for internal teams at exit from teams construct.
2314  // But there is barrier for external team (league).
2315  __kmp_internal_join(loc, gtid, team);
2316  } else {
2317  master_th->th.th_task_state =
2318  0; // AC: no tasking in teams (out of any parallel)
2319  }
2320 
2321  KMP_MB();
2322 
2323 #if OMPT_SUPPORT
2324  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2325  void *codeptr = team->t.ompt_team_info.master_return_address;
2326 #endif
2327 
2328 #if USE_ITT_BUILD
2329  if (__itt_stack_caller_create_ptr) {
2330  __kmp_itt_stack_caller_destroy(
2331  (__itt_caller)team->t
2332  .t_stack_id); // destroy the stack stitching id after join barrier
2333  }
2334 
2335  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2336  if (team->t.t_active_level == 1 &&
2337  !master_th->th.th_teams_microtask) { /* not in teams construct */
2338  master_th->th.th_ident = loc;
2339  // only one notification scheme (either "submit" or "forking/joined", not
2340  // both)
2341  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2342  __kmp_forkjoin_frames_mode == 3)
2343  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2344  master_th->th.th_frame_time, 0, loc,
2345  master_th->th.th_team_nproc, 1);
2346  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2347  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2348  __kmp_itt_region_joined(gtid);
2349  } // active_level == 1
2350 #endif /* USE_ITT_BUILD */
2351 
2352  if (master_th->th.th_teams_microtask && !exit_teams &&
2353  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2354  team->t.t_level == master_th->th.th_teams_level + 1) {
2355  // AC: We need to leave the team structure intact at the end of a parallel
2356  // inside the teams construct, so that the same (hot) team works at the
2357  // next parallel; only adjust the nesting levels
2358 
2359  /* Decrement our nested depth level */
2360  team->t.t_level--;
2361  team->t.t_active_level--;
2362  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2363 
2364  // Restore number of threads in the team if needed. This code relies on
2365  // the proper adjustment of th_teams_size.nth after the fork in
2366  // __kmp_teams_master on each teams master in the case that
2367  // __kmp_reserve_threads reduced it.
2368  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2369  int old_num = master_th->th.th_team_nproc;
2370  int new_num = master_th->th.th_teams_size.nth;
2371  kmp_info_t **other_threads = team->t.t_threads;
2372  team->t.t_nproc = new_num;
2373  for (int i = 0; i < old_num; ++i) {
2374  other_threads[i]->th.th_team_nproc = new_num;
2375  }
2376  // Adjust states of non-used threads of the team
2377  for (int i = old_num; i < new_num; ++i) {
2378  // Re-initialize thread's barrier data.
2379  KMP_DEBUG_ASSERT(other_threads[i]);
2380  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2381  for (int b = 0; b < bs_last_barrier; ++b) {
2382  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2383  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2384 #if USE_DEBUGGER
2385  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2386 #endif
2387  }
2388  if (__kmp_tasking_mode != tskm_immediate_exec) {
2389  // Synchronize thread's task state
2390  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2391  }
2392  }
2393  }
2394 
2395 #if OMPT_SUPPORT
2396  if (ompt_enabled.enabled) {
2397  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2398  codeptr);
2399  }
2400 #endif
2401 
2402  return;
2403  }
2404 
2405  /* do cleanup and restore the parent team */
2406  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2407  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2408 
2409  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2410 
2411  /* jc: The following lock has instructions with REL and ACQ semantics,
2412  separating the parallel user code called in this parallel region
2413  from the serial user code called after this function returns. */
2414  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2415 
2416  if (!master_th->th.th_teams_microtask ||
2417  team->t.t_level > master_th->th.th_teams_level) {
2418  /* Decrement our nested depth level */
2419  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2420  }
2421  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2422 
2423 #if OMPT_SUPPORT
2424  if (ompt_enabled.enabled) {
2425  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426  if (ompt_enabled.ompt_callback_implicit_task) {
2427  int ompt_team_size = team->t.t_nproc;
2428  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2429  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2430  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2431  }
2432 
2433  task_info->frame.exit_frame = ompt_data_none;
2434  task_info->task_data = ompt_data_none;
2435  }
2436 #endif
2437 
2438  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2439  master_th, team));
2440  __kmp_pop_current_task_from_thread(master_th);
2441 
2442 #if KMP_AFFINITY_SUPPORTED
2443  // Restore master thread's partition.
2444  master_th->th.th_first_place = team->t.t_first_place;
2445  master_th->th.th_last_place = team->t.t_last_place;
2446 #endif // KMP_AFFINITY_SUPPORTED
2447  master_th->th.th_def_allocator = team->t.t_def_allocator;
2448 
2449  updateHWFPControl(team);
2450 
2451  if (root->r.r_active != master_active)
2452  root->r.r_active = master_active;
2453 
2454  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2455  master_th)); // this will free worker threads
2456 
2457  /* this race was fun to find. make sure the following is in the critical
2458  region otherwise assertions may fail occasionally since the old team may be
2459  reallocated and the hierarchy appears inconsistent. it is actually safe to
2460  run and won't cause any bugs, but will cause those assertion failures. it's
2461  only one deref&assign so might as well put this in the critical region */
2462  master_th->th.th_team = parent_team;
2463  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2464  master_th->th.th_team_master = parent_team->t.t_threads[0];
2465  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2466 
2467  /* restore serialized team, if need be */
2468  if (parent_team->t.t_serialized &&
2469  parent_team != master_th->th.th_serial_team &&
2470  parent_team != root->r.r_root_team) {
2471  __kmp_free_team(root,
2472  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2473  master_th->th.th_serial_team = parent_team;
2474  }
2475 
2476  if (__kmp_tasking_mode != tskm_immediate_exec) {
2477  if (master_th->th.th_task_state_top >
2478  0) { // Restore task state from memo stack
2479  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2480  // Remember master's state if we re-use this nested hot team
2481  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2482  master_th->th.th_task_state;
2483  --master_th->th.th_task_state_top; // pop
2484  // Now restore state at this level
2485  master_th->th.th_task_state =
2486  master_th->th
2487  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2488  }
2489  // Copy the task team from the parent team to the master thread
2490  master_th->th.th_task_team =
2491  parent_team->t.t_task_team[master_th->th.th_task_state];
2492  KA_TRACE(20,
2493  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2494  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2495  parent_team));
2496  }
2497 
2498  // TODO: GEH - cannot do this assertion because root thread not set up as
2499  // executing
2500  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2501  master_th->th.th_current_task->td_flags.executing = 1;
2502 
2503  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2504 
2505 #if OMPT_SUPPORT
2506  if (ompt_enabled.enabled) {
2507  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2508  codeptr);
2509  }
2510 #endif
2511 
2512  KMP_MB();
2513  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2514 }
2515 
2516 /* Check whether we should push an internal control record onto the
2517  serial team stack. If so, do it. */
2518 void __kmp_save_internal_controls(kmp_info_t *thread) {
2519 
2520  if (thread->th.th_team != thread->th.th_serial_team) {
2521  return;
2522  }
2523  if (thread->th.th_team->t.t_serialized > 1) {
2524  int push = 0;
2525 
2526  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2527  push = 1;
2528  } else {
2529  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530  thread->th.th_team->t.t_serialized) {
2531  push = 1;
2532  }
2533  }
2534  if (push) { /* push a record on the serial team's stack */
2535  kmp_internal_control_t *control =
2536  (kmp_internal_control_t *)__kmp_allocate(
2537  sizeof(kmp_internal_control_t));
2538 
2539  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2540 
2541  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2542 
2543  control->next = thread->th.th_team->t.t_control_stack_top;
2544  thread->th.th_team->t.t_control_stack_top = control;
2545  }
2546  }
2547 }
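
/* Illustrative example (assumption about intent, not from this file): the
   push matters when ICVs change inside nested serialized regions, e.g.

     #pragma omp parallel if(0)
     {
       #pragma omp parallel if(0)   // nested: t_serialized > 1
       {
         omp_set_num_threads(3);    // record pushed here so the change can be
       }                            // undone when this serialized level ends
     }
*/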
2548 
2549 /* Changes set_nproc */
2550 void __kmp_set_num_threads(int new_nth, int gtid) {
2551  kmp_info_t *thread;
2552  kmp_root_t *root;
2553 
2554  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2555  KMP_DEBUG_ASSERT(__kmp_init_serial);
2556 
2557  if (new_nth < 1)
2558  new_nth = 1;
2559  else if (new_nth > __kmp_max_nth)
2560  new_nth = __kmp_max_nth;
2561 
2562  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2563  thread = __kmp_threads[gtid];
2564  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2565  return; // nothing to do
2566 
2567  __kmp_save_internal_controls(thread);
2568 
2569  set__nproc(thread, new_nth);
2570 
2571  // If this omp_set_num_threads() call will cause the hot team size to be
2572  // reduced (in the absence of a num_threads clause), then reduce it now,
2573  // rather than waiting for the next parallel region.
2574  root = thread->th.th_root;
2575  if (__kmp_init_parallel && (!root->r.r_active) &&
2576  (root->r.r_hot_team->t.t_nproc > new_nth)
2577 #if KMP_NESTED_HOT_TEAMS
2578  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2579 #endif
2580  ) {
2581  kmp_team_t *hot_team = root->r.r_hot_team;
2582  int f;
2583 
2584  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2585 
2586  // Release the extra threads we don't need any more.
2587  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2588  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2589  if (__kmp_tasking_mode != tskm_immediate_exec) {
2590  // When decreasing team size, threads no longer in the team should unref
2591  // task team.
2592  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2593  }
2594  __kmp_free_thread(hot_team->t.t_threads[f]);
2595  hot_team->t.t_threads[f] = NULL;
2596  }
2597  hot_team->t.t_nproc = new_nth;
2598 #if KMP_NESTED_HOT_TEAMS
2599  if (thread->th.th_hot_teams) {
2600  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2601  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2602  }
2603 #endif
2604 
2605  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2606 
2607  // Update the t_nproc field in the threads that are still active.
2608  for (f = 0; f < new_nth; f++) {
2609  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2610  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2611  }
2612  // Special flag in case omp_set_num_threads() call
2613  hot_team->t.t_size_changed = -1;
2614  }
2615 }
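
/* Illustrative example (not from this file): this routine backs
   omp_set_num_threads(), e.g.

     omp_set_num_threads(2);   // set the nproc ICV of the calling thread
     #pragma omp parallel      // the next region forks at most 2 threads
     { ... }

   and, per the code above, may immediately shrink the root's hot team so the
   now-surplus workers are released instead of kept around. */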
2616 
2617 /* Changes max_active_levels */
2618 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2619  kmp_info_t *thread;
2620 
2621  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2622  "%d = (%d)\n",
2623  gtid, max_active_levels));
2624  KMP_DEBUG_ASSERT(__kmp_init_serial);
2625 
2626  // validate max_active_levels
2627  if (max_active_levels < 0) {
2628  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2629  // We ignore this call if the user has specified a negative value.
2630  // The current setting won't be changed. The last valid setting will be
2631  // used. A warning will be issued (if warnings are allowed as controlled by
2632  // the KMP_WARNINGS env var).
2633  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2634  "max_active_levels for thread %d = (%d)\n",
2635  gtid, max_active_levels));
2636  return;
2637  }
2638  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2639  // it's OK, the max_active_levels is within the valid range: [ 0;
2640  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2641  // We allow a zero value. (implementation defined behavior)
2642  } else {
2643  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2644  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2645  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2646  // Current upper limit is MAX_INT. (implementation defined behavior)
2647  // If the input exceeds the upper limit, we correct the input to be the
2648  // upper limit. (implementation defined behavior)
2649  // Actually, the flow should never get here while the limit is MAX_INT.
2650  }
2651  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2652  "max_active_levels for thread %d = (%d)\n",
2653  gtid, max_active_levels));
2654 
2655  thread = __kmp_threads[gtid];
2656 
2657  __kmp_save_internal_controls(thread);
2658 
2659  set__max_active_levels(thread, max_active_levels);
2660 }
2661 
2662 /* Gets max_active_levels */
2663 int __kmp_get_max_active_levels(int gtid) {
2664  kmp_info_t *thread;
2665 
2666  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2667  KMP_DEBUG_ASSERT(__kmp_init_serial);
2668 
2669  thread = __kmp_threads[gtid];
2670  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2671  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2672  "curtask_maxaclevel=%d\n",
2673  gtid, thread->th.th_current_task,
2674  thread->th.th_current_task->td_icvs.max_active_levels));
2675  return thread->th.th_current_task->td_icvs.max_active_levels;
2676 }
2677 
2678 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2679 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2680 
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2682 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2683  kmp_info_t *thread;
2684  kmp_sched_t orig_kind;
2685  // kmp_team_t *team;
2686 
2687  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2688  gtid, (int)kind, chunk));
2689  KMP_DEBUG_ASSERT(__kmp_init_serial);
2690 
2691  // Check if the kind parameter is valid, correct if needed.
2692  // Valid parameters should fit in one of two intervals - standard or extended:
2693  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2695  orig_kind = kind;
2696  kind = __kmp_sched_without_mods(kind);
2697 
2698  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2699  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2700  // TODO: Hint needs attention in case we change the default schedule.
2701  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2702  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2703  __kmp_msg_null);
2704  kind = kmp_sched_default;
2705  chunk = 0; // ignore chunk value in case of bad kind
2706  }
2707 
2708  thread = __kmp_threads[gtid];
2709 
2710  __kmp_save_internal_controls(thread);
2711 
2712  if (kind < kmp_sched_upper_std) {
2713  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2714  // differentiate static chunked vs. unchunked: chunk should be invalid to
2715  // indicate an unchunked schedule (which is the default)
2716  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2717  } else {
2718  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2719  __kmp_sch_map[kind - kmp_sched_lower - 1];
2720  }
2721  } else {
2722  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2723  // kmp_sched_lower - 2 ];
2724  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2725  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2726  kmp_sched_lower - 2];
2727  }
2728  __kmp_sched_apply_mods_intkind(
2729  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2730  if (kind == kmp_sched_auto || chunk < 1) {
2731  // ignore parameter chunk for schedule auto
2732  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2733  } else {
2734  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2735  }
2736 }
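
/* Illustrative example (not from this file): this path serves
   omp_set_schedule(), e.g.

     omp_set_schedule(omp_sched_dynamic, 4);

   which maps kmp_sched_dynamic through __kmp_sch_map to a dynamic (chunked)
   run-time schedule with chunk 4 in the calling thread's def-sched-var ICV. */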
2737 
2738 /* Gets def_sched_var ICV values */
2739 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2740  kmp_info_t *thread;
2741  enum sched_type th_type;
2742 
2743  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2744  KMP_DEBUG_ASSERT(__kmp_init_serial);
2745 
2746  thread = __kmp_threads[gtid];
2747 
2748  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2749  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2750  case kmp_sch_static:
2751  case kmp_sch_static_greedy:
2752  case kmp_sch_static_balanced:
2753  *kind = kmp_sched_static;
2754  __kmp_sched_apply_mods_stdkind(kind, th_type);
2755  *chunk = 0; // chunk was not set, try to show this fact via zero value
2756  return;
2757  case kmp_sch_static_chunked:
2758  *kind = kmp_sched_static;
2759  break;
2760  case kmp_sch_dynamic_chunked:
2761  *kind = kmp_sched_dynamic;
2762  break;
2763  case kmp_sch_guided_chunked:
2764  case kmp_sch_guided_iterative_chunked:
2765  case kmp_sch_guided_analytical_chunked:
2766  *kind = kmp_sched_guided;
2767  break;
2768  case kmp_sch_auto:
2769  *kind = kmp_sched_auto;
2770  break;
2771  case kmp_sch_trapezoidal:
2772  *kind = kmp_sched_trapezoidal;
2773  break;
2774 #if KMP_STATIC_STEAL_ENABLED
2775  case kmp_sch_static_steal:
2776  *kind = kmp_sched_static_steal;
2777  break;
2778 #endif
2779  default:
2780  KMP_FATAL(UnknownSchedulingType, th_type);
2781  }
2782 
2783  __kmp_sched_apply_mods_stdkind(kind, th_type);
2784  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2785 }
2786 
2787 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2788 
2789  int ii, dd;
2790  kmp_team_t *team;
2791  kmp_info_t *thr;
2792 
2793  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2794  KMP_DEBUG_ASSERT(__kmp_init_serial);
2795 
2796  // validate level
2797  if (level == 0)
2798  return 0;
2799  if (level < 0)
2800  return -1;
2801  thr = __kmp_threads[gtid];
2802  team = thr->th.th_team;
2803  ii = team->t.t_level;
2804  if (level > ii)
2805  return -1;
2806 
2807  if (thr->th.th_teams_microtask) {
2808  // AC: we are in teams region where multiple nested teams have same level
2809  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2810  if (level <=
2811  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2812  KMP_DEBUG_ASSERT(ii >= tlevel);
2813  // AC: As we need to pass through the teams league, we need to artificially
2814  // increase ii
2815  if (ii == tlevel) {
2816  ii += 2; // three teams have same level
2817  } else {
2818  ii++; // two teams have same level
2819  }
2820  }
2821  }
2822 
2823  if (ii == level)
2824  return __kmp_tid_from_gtid(gtid);
2825 
2826  dd = team->t.t_serialized;
2827  level++;
2828  while (ii > level) {
2829  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2830  }
2831  if ((team->t.t_serialized) && (!dd)) {
2832  team = team->t.t_parent;
2833  continue;
2834  }
2835  if (ii > level) {
2836  team = team->t.t_parent;
2837  dd = team->t.t_serialized;
2838  ii--;
2839  }
2840  }
2841 
2842  return (dd > 1) ? (0) : (team->t.t_master_tid);
2843 }
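
/* Illustrative example (not from this file): this implements
   omp_get_ancestor_thread_num(); with nesting enabled,

     #pragma omp parallel num_threads(4)
     #pragma omp parallel num_threads(2)
     {
       int outer = omp_get_ancestor_thread_num(1); // id in the outer team, 0..3
       int inner = omp_get_ancestor_thread_num(2); // own id in the inner team
     }
*/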
2844 
2845 int __kmp_get_team_size(int gtid, int level) {
2846 
2847  int ii, dd;
2848  kmp_team_t *team;
2849  kmp_info_t *thr;
2850 
2851  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2852  KMP_DEBUG_ASSERT(__kmp_init_serial);
2853 
2854  // validate level
2855  if (level == 0)
2856  return 1;
2857  if (level < 0)
2858  return -1;
2859  thr = __kmp_threads[gtid];
2860  team = thr->th.th_team;
2861  ii = team->t.t_level;
2862  if (level > ii)
2863  return -1;
2864 
2865  if (thr->th.th_teams_microtask) {
2866  // AC: we are in teams region where multiple nested teams have same level
2867  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2868  if (level <=
2869  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2870  KMP_DEBUG_ASSERT(ii >= tlevel);
2871  // AC: As we need to pass through the teams league, we need to artificially
2872  // increase ii
2873  if (ii == tlevel) {
2874  ii += 2; // three teams have same level
2875  } else {
2876  ii++; // two teams have same level
2877  }
2878  }
2879  }
2880 
2881  while (ii > level) {
2882  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2883  }
2884  if (team->t.t_serialized && (!dd)) {
2885  team = team->t.t_parent;
2886  continue;
2887  }
2888  if (ii > level) {
2889  team = team->t.t_parent;
2890  ii--;
2891  }
2892  }
2893 
2894  return team->t.t_nproc;
2895 }
2896 
2897 kmp_r_sched_t __kmp_get_schedule_global() {
2898  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2899  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2900  // independently, so one can get the updated schedule here.
2901 
2902  kmp_r_sched_t r_sched;
2903 
2904  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2905  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2906  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2907  // different roots (even in OMP 2.5)
2908  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2909  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2910  if (s == kmp_sch_static) {
2911  // replace STATIC with more detailed schedule (balanced or greedy)
2912  r_sched.r_sched_type = __kmp_static;
2913  } else if (s == kmp_sch_guided_chunked) {
2914  // replace GUIDED with more detailed schedule (iterative or analytical)
2915  r_sched.r_sched_type = __kmp_guided;
2916  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2917  r_sched.r_sched_type = __kmp_sched;
2918  }
2919  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2920 
2921  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2922  // __kmp_chunk may be wrong here (if it was not ever set)
2923  r_sched.chunk = KMP_DEFAULT_CHUNK;
2924  } else {
2925  r_sched.chunk = __kmp_chunk;
2926  }
2927 
2928  return r_sched;
2929 }
2930 
2931 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2932  at least argc *t_argv entries for the requested team. */
2933 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2934 
2935  KMP_DEBUG_ASSERT(team);
2936  if (!realloc || argc > team->t.t_max_argc) {
2937 
2938  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2939  "current entries=%d\n",
2940  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2941  /* if previously allocated heap space for args, free them */
2942  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2943  __kmp_free((void *)team->t.t_argv);
2944 
2945  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2946  /* use unused space in the cache line for arguments */
2947  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2948  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2949  "argv entries\n",
2950  team->t.t_id, team->t.t_max_argc));
2951  team->t.t_argv = &team->t.t_inline_argv[0];
2952  if (__kmp_storage_map) {
2953  __kmp_print_storage_map_gtid(
2954  -1, &team->t.t_inline_argv[0],
2955  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2956  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2957  team->t.t_id);
2958  }
2959  } else {
2960  /* allocate space for arguments in the heap */
2961  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2962  ? KMP_MIN_MALLOC_ARGV_ENTRIES
2963  : 2 * argc;
2964  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2965  "argv entries\n",
2966  team->t.t_id, team->t.t_max_argc));
2967  team->t.t_argv =
2968  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2969  if (__kmp_storage_map) {
2970  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2971  &team->t.t_argv[team->t.t_max_argc],
2972  sizeof(void *) * team->t.t_max_argc,
2973  "team_%d.t_argv", team->t.t_id);
2974  }
2975  }
2976  }
2977 }
2978 
2979 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
2980  int i;
2981  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2982  team->t.t_threads =
2983  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
2984  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
2985  sizeof(dispatch_shared_info_t) * num_disp_buff);
2986  team->t.t_dispatch =
2987  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
2988  team->t.t_implicit_task_taskdata =
2989  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
2990  team->t.t_max_nproc = max_nth;
2991 
2992  /* setup dispatch buffers */
2993  for (i = 0; i < num_disp_buff; ++i) {
2994  team->t.t_disp_buffer[i].buffer_index = i;
2995  team->t.t_disp_buffer[i].doacross_buf_idx = i;
2996  }
2997 }
2998 
2999 static void __kmp_free_team_arrays(kmp_team_t *team) {
3000  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3001  int i;
3002  for (i = 0; i < team->t.t_max_nproc; ++i) {
3003  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3004  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3005  team->t.t_dispatch[i].th_disp_buffer = NULL;
3006  }
3007  }
3008 #if KMP_USE_HIER_SCHED
3009  __kmp_dispatch_free_hierarchies(team);
3010 #endif
3011  __kmp_free(team->t.t_threads);
3012  __kmp_free(team->t.t_disp_buffer);
3013  __kmp_free(team->t.t_dispatch);
3014  __kmp_free(team->t.t_implicit_task_taskdata);
3015  team->t.t_threads = NULL;
3016  team->t.t_disp_buffer = NULL;
3017  team->t.t_dispatch = NULL;
3018  team->t.t_implicit_task_taskdata = 0;
3019 }
3020 
3021 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3022  kmp_info_t **oldThreads = team->t.t_threads;
3023 
3024  __kmp_free(team->t.t_disp_buffer);
3025  __kmp_free(team->t.t_dispatch);
3026  __kmp_free(team->t.t_implicit_task_taskdata);
3027  __kmp_allocate_team_arrays(team, max_nth);
3028 
3029  KMP_MEMCPY(team->t.t_threads, oldThreads,
3030  team->t.t_nproc * sizeof(kmp_info_t *));
3031 
3032  __kmp_free(oldThreads);
3033 }
3034 
3035 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3036 
3037  kmp_r_sched_t r_sched =
3038  __kmp_get_schedule_global(); // get current state of scheduling globals
3039 
3040  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3041 
3042  kmp_internal_control_t g_icvs = {
3043  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3044  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3045  // adjustment of threads (per thread)
3046  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3047  // whether blocktime is explicitly set
3048  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3049 #if KMP_USE_MONITOR
3050  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3051 // intervals
3052 #endif
3053  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3054  // next parallel region (per thread)
3055  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3056  __kmp_cg_max_nth, // int thread_limit;
3057  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3058  // for max_active_levels
3059  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3060  // {sched,chunk} pair
3061  __kmp_nested_proc_bind.bind_types[0],
3062  __kmp_default_device,
3063  NULL // struct kmp_internal_control *next;
3064  };
3065 
3066  return g_icvs;
3067 }
3068 
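// Derive ICVs for a new thread from an existing team: copy the ICVs of the
// team's master thread's current task.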
3069 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3070 
3071  kmp_internal_control_t gx_icvs;
3072  gx_icvs.serial_nesting_level =
3073  0; // probably =team->t.t_serial like in save_inter_controls
3074  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3075  gx_icvs.next = NULL;
3076 
3077  return gx_icvs;
3078 }
3079 
3080 static void __kmp_initialize_root(kmp_root_t *root) {
3081  int f;
3082  kmp_team_t *root_team;
3083  kmp_team_t *hot_team;
3084  int hot_team_max_nth;
3085  kmp_r_sched_t r_sched =
3086  __kmp_get_schedule_global(); // get current state of scheduling globals
3087  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3088  KMP_DEBUG_ASSERT(root);
3089  KMP_ASSERT(!root->r.r_begin);
3090 
3091  /* setup the root state structure */
3092  __kmp_init_lock(&root->r.r_begin_lock);
3093  root->r.r_begin = FALSE;
3094  root->r.r_active = FALSE;
3095  root->r.r_in_parallel = 0;
3096  root->r.r_blocktime = __kmp_dflt_blocktime;
3097 
3098  /* setup the root team for this task */
3099  /* allocate the root team structure */
3100  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3101 
3102  root_team =
3103  __kmp_allocate_team(root,
3104  1, // new_nproc
3105  1, // max_nproc
3106 #if OMPT_SUPPORT
3107  ompt_data_none, // root parallel id
3108 #endif
3109  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3110  0 // argc
3111  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3112  );
3113 #if USE_DEBUGGER
3114  // Non-NULL value should be assigned to make the debugger display the root
3115  // team.
3116  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3117 #endif
3118 
3119  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3120 
3121  root->r.r_root_team = root_team;
3122  root_team->t.t_control_stack_top = NULL;
3123 
3124  /* initialize root team */
3125  root_team->t.t_threads[0] = NULL;
3126  root_team->t.t_nproc = 1;
3127  root_team->t.t_serialized = 1;
3128  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3129  root_team->t.t_sched.sched = r_sched.sched;
3130  KA_TRACE(
3131  20,
3132  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3133  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3134 
3135  /* setup the hot team for this task */
3136  /* allocate the hot team structure */
3137  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3138 
3139  hot_team =
3140  __kmp_allocate_team(root,
3141  1, // new_nproc
3142  __kmp_dflt_team_nth_ub * 2, // max_nproc
3143 #if OMPT_SUPPORT
3144  ompt_data_none, // root parallel id
3145 #endif
3146  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3147  0 // argc
3148  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3149  );
3150  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3151 
3152  root->r.r_hot_team = hot_team;
3153  root_team->t.t_control_stack_top = NULL;
3154 
3155  /* first-time initialization */
3156  hot_team->t.t_parent = root_team;
3157 
3158  /* initialize hot team */
3159  hot_team_max_nth = hot_team->t.t_max_nproc;
3160  for (f = 0; f < hot_team_max_nth; ++f) {
3161  hot_team->t.t_threads[f] = NULL;
3162  }
3163  hot_team->t.t_nproc = 1;
3164  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3165  hot_team->t.t_sched.sched = r_sched.sched;
3166  hot_team->t.t_size_changed = 0;
3167 }
3168 
3169 #ifdef KMP_DEBUG
3170 
3171 typedef struct kmp_team_list_item {
3172  kmp_team_p const *entry;
3173  struct kmp_team_list_item *next;
3174 } kmp_team_list_item_t;
3175 typedef kmp_team_list_item_t *kmp_team_list_t;
3176 
3177 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3178  kmp_team_list_t list, // List of teams.
3179  kmp_team_p const *team // Team to add.
3180  ) {
3181 
3182  // The list must terminate with an item where both entry and next are NULL.
3183  // A team is added to the list only once.
3184  // The list is sorted in ascending order by team id.
3185  // Team id is *not* a unique key.
3186 
3187  kmp_team_list_t l;
3188 
3189  KMP_DEBUG_ASSERT(list != NULL);
3190  if (team == NULL) {
3191  return;
3192  }
3193 
3194  __kmp_print_structure_team_accum(list, team->t.t_parent);
3195  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3196 
3197  // Search list for the team.
3198  l = list;
3199  while (l->next != NULL && l->entry != team) {
3200  l = l->next;
3201  }
3202  if (l->next != NULL) {
3203  return; // Team has been added before, exit.
3204  }
3205 
3206  // Team is not found. Search list again for insertion point.
3207  l = list;
3208  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3209  l = l->next;
3210  }
3211 
3212  // Insert team.
3213  {
3214  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3215  sizeof(kmp_team_list_item_t));
3216  *item = *l;
3217  l->entry = team;
3218  l->next = item;
3219  }
3220 }
3221 
3222 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3223 
3224  ) {
3225  __kmp_printf("%s", title);
3226  if (team != NULL) {
3227  __kmp_printf("%2x %p\n", team->t.t_id, team);
3228  } else {
3229  __kmp_printf(" - (nil)\n");
3230  }
3231 }
3232 
3233 static void __kmp_print_structure_thread(char const *title,
3234  kmp_info_p const *thread) {
3235  __kmp_printf("%s", title);
3236  if (thread != NULL) {
3237  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3238  } else {
3239  __kmp_printf(" - (nil)\n");
3240  }
3241 }
3242 
3243 void __kmp_print_structure(void) {
3244 
3245  kmp_team_list_t list;
3246 
3247  // Initialize list of teams.
3248  list =
3249  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3250  list->entry = NULL;
3251  list->next = NULL;
3252 
3253  __kmp_printf("\n------------------------------\nGlobal Thread "
3254  "Table\n------------------------------\n");
3255  {
3256  int gtid;
3257  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3258  __kmp_printf("%2d", gtid);
3259  if (__kmp_threads != NULL) {
3260  __kmp_printf(" %p", __kmp_threads[gtid]);
3261  }
3262  if (__kmp_root != NULL) {
3263  __kmp_printf(" %p", __kmp_root[gtid]);
3264  }
3265  __kmp_printf("\n");
3266  }
3267  }
3268 
3269  // Print out __kmp_threads array.
3270  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3271  "----------\n");
3272  if (__kmp_threads != NULL) {
3273  int gtid;
3274  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3275  kmp_info_t const *thread = __kmp_threads[gtid];
3276  if (thread != NULL) {
3277  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3278  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3279  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3280  __kmp_print_structure_team(" Serial Team: ",
3281  thread->th.th_serial_team);
3282  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3283  __kmp_print_structure_thread(" Master: ",
3284  thread->th.th_team_master);
3285  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3286  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3287  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3288  __kmp_print_structure_thread(" Next in pool: ",
3289  thread->th.th_next_pool);
3290  __kmp_printf("\n");
3291  __kmp_print_structure_team_accum(list, thread->th.th_team);
3292  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3293  }
3294  }
3295  } else {
3296  __kmp_printf("Threads array is not allocated.\n");
3297  }
3298 
3299  // Print out __kmp_root array.
3300  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3301  "--------\n");
3302  if (__kmp_root != NULL) {
3303  int gtid;
3304  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3305  kmp_root_t const *root = __kmp_root[gtid];
3306  if (root != NULL) {
3307  __kmp_printf("GTID %2d %p:\n", gtid, root);
3308  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3309  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3310  __kmp_print_structure_thread(" Uber Thread: ",
3311  root->r.r_uber_thread);
3312  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3313  __kmp_printf(" In Parallel: %2d\n",
3314  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3315  __kmp_printf("\n");
3316  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3317  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3318  }
3319  }
3320  } else {
3321  __kmp_printf("Ubers array is not allocated.\n");
3322  }
3323 
3324  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3325  "--------\n");
3326  while (list->next != NULL) {
3327  kmp_team_p const *team = list->entry;
3328  int i;
3329  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3330  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3331  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3332  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3333  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3334  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3335  for (i = 0; i < team->t.t_nproc; ++i) {
3336  __kmp_printf(" Thread %2d: ", i);
3337  __kmp_print_structure_thread("", team->t.t_threads[i]);
3338  }
3339  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3340  __kmp_printf("\n");
3341  list = list->next;
3342  }
3343 
3344  // Print out __kmp_thread_pool and __kmp_team_pool.
3345  __kmp_printf("\n------------------------------\nPools\n----------------------"
3346  "--------\n");
3347  __kmp_print_structure_thread("Thread pool: ",
3348  CCAST(kmp_info_t *, __kmp_thread_pool));
3349  __kmp_print_structure_team("Team pool: ",
3350  CCAST(kmp_team_t *, __kmp_team_pool));
3351  __kmp_printf("\n");
3352 
3353  // Free team list.
3354  while (list != NULL) {
3355  kmp_team_list_item_t *item = list;
3356  list = list->next;
3357  KMP_INTERNAL_FREE(item);
3358  }
3359 }
3360 
3361 #endif
3362 
3363 //---------------------------------------------------------------------------
3364 // Stuff for per-thread fast random number generator
3365 // Table of primes
3366 static const unsigned __kmp_primes[] = {
3367  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3368  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3369  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3370  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3371  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3372  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3373  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3374  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3375  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3376  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3377  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3378 
3379 //---------------------------------------------------------------------------
3380 // __kmp_get_random: Get a random number using a linear congruential method.
3381 unsigned short __kmp_get_random(kmp_info_t *thread) {
3382  unsigned x = thread->th.th_x;
3383  unsigned short r = x >> 16;
3384 
3385  thread->th.th_x = x * thread->th.th_a + 1;
3386 
3387  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3388  thread->th.th_info.ds.ds_tid, r));
3389 
3390  return r;
3391 }
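// The generator is a per-thread linear congruential sequence
//   x_{n+1} = a * x_n + 1 (mod 2^32),
// where the multiplier 'a' (th_a) is picked per thread from __kmp_primes.
// Returning the high 16 bits of x rather than the low bits avoids the short
// periods of the low-order bits of a power-of-two-modulus LCG.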
3392 //--------------------------------------------------------
3393 // __kmp_init_random: Initialize a random number generator
3394 void __kmp_init_random(kmp_info_t *thread) {
3395  unsigned seed = thread->th.th_info.ds.ds_tid;
3396 
3397  thread->th.th_a =
3398  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3399  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3400  KA_TRACE(30,
3401  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3402 }
3403 
3404 #if KMP_OS_WINDOWS
3405 /* reclaim array entries for root threads that are already dead, returns number
3406  * reclaimed */
3407 static int __kmp_reclaim_dead_roots(void) {
3408  int i, r = 0;
3409 
3410  for (i = 0; i < __kmp_threads_capacity; ++i) {
3411  if (KMP_UBER_GTID(i) &&
3412  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3413  !__kmp_root[i]
3414  ->r.r_active) { // AC: reclaim only roots died in non-active state
3415  r += __kmp_unregister_root_other_thread(i);
3416  }
3417  }
3418  return r;
3419 }
3420 #endif
3421 
3422 /* This function attempts to create free entries in __kmp_threads and
3423  __kmp_root, and returns the number of free entries generated.
3424 
3425  For Windows* OS static library, the first mechanism used is to reclaim array
3426  entries for root threads that are already dead.
3427 
3428  On all platforms, expansion is attempted on the arrays __kmp_threads and
3429  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3430  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3431  threadprivate cache array has been created. Synchronization with
3432  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3433 
3434  After any dead root reclamation, if the clipping value allows array expansion
3435  to result in the generation of a total of nNeed free slots, the function does
3436  that expansion. If not, nothing is done beyond the possible initial root
3437  thread reclamation.
3438 
3439  If any argument is negative, the behavior is undefined. */
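// Illustrative example: with __kmp_threads_capacity == 32 and nNeed == 40,
// minimumRequiredCapacity is 72, so the capacity is doubled 32 -> 64 -> 128
// (clipped to __kmp_sys_max_nth) before the arrays are re-allocated and the
// old contents copied over.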
3440 static int __kmp_expand_threads(int nNeed) {
3441  int added = 0;
3442  int minimumRequiredCapacity;
3443  int newCapacity;
3444  kmp_info_t **newThreads;
3445  kmp_root_t **newRoot;
3446 
3447 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3448 // resizing __kmp_threads does not need additional protection if foreign
3449 // threads are present
3450 
3451 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3452  /* only for Windows static library */
3453  /* reclaim array entries for root threads that are already dead */
3454  added = __kmp_reclaim_dead_roots();
3455 
3456  if (nNeed) {
3457  nNeed -= added;
3458  if (nNeed < 0)
3459  nNeed = 0;
3460  }
3461 #endif
3462  if (nNeed <= 0)
3463  return added;
3464 
3465  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3466  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3467  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3468  // > __kmp_max_nth in one of two ways:
3469  //
3470  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3471  // may not be reused by another thread, so we may need to increase
3472  // __kmp_threads_capacity to __kmp_max_nth + 1.
3473  //
3474  // 2) New foreign root(s) are encountered. We always register new foreign
3475  // roots. This may cause a smaller # of threads to be allocated at
3476  // subsequent parallel regions, but the worker threads hang around (and
3477  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3478  //
3479  // Anyway, that is the reason for moving the check to see if
3480  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3481  // instead of having it performed here. -BB
3482 
3483  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3484 
3485  /* compute expansion headroom to check if we can expand */
3486  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3487  /* possible expansion too small -- give up */
3488  return added;
3489  }
3490  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3491 
3492  newCapacity = __kmp_threads_capacity;
3493  do {
3494  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3495  : __kmp_sys_max_nth;
3496  } while (newCapacity < minimumRequiredCapacity);
3497  newThreads = (kmp_info_t **)__kmp_allocate(
3498  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3499  newRoot =
3500  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3501  KMP_MEMCPY(newThreads, __kmp_threads,
3502  __kmp_threads_capacity * sizeof(kmp_info_t *));
3503  KMP_MEMCPY(newRoot, __kmp_root,
3504  __kmp_threads_capacity * sizeof(kmp_root_t *));
3505 
3506  kmp_info_t **temp_threads = __kmp_threads;
3507  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3508  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3509  __kmp_free(temp_threads);
3510  added += newCapacity - __kmp_threads_capacity;
3511  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3512 
3513  if (newCapacity > __kmp_tp_capacity) {
3514  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3515  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3516  __kmp_threadprivate_resize_cache(newCapacity);
3517  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3518  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3519  }
3520  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3521  }
3522 
3523  return added;
3524 }
3525 
3526 /* Register the current thread as a root thread and obtain our gtid. We must
3527  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3528  the thread that calls from __kmp_do_serial_initialize() */
3529 int __kmp_register_root(int initial_thread) {
3530  kmp_info_t *root_thread;
3531  kmp_root_t *root;
3532  int gtid;
3533  int capacity;
3534  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3535  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3536  KMP_MB();
3537 
3538  /* 2007-03-02:
3539  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3540  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3541  does not work as expected -- it may return false (meaning there is at least
3542  one empty slot in the __kmp_threads array), but it is possible that the only
3543  free slot is #0, which is reserved for the initial thread and so cannot be
3544  used for this one. The following code works around this bug.
3545 
3546  However, the right solution seems to be not reserving slot #0 for the
3547  initial thread because:
3548  (1) there is no magic in slot #0,
3549  (2) we cannot detect the initial thread reliably (the first thread which does
3550  serial initialization may not be a real initial thread).
3551  */
3552  capacity = __kmp_threads_capacity;
3553  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3554  --capacity;
3555  }
3556 
3557  /* see if there are too many threads */
3558  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3559  if (__kmp_tp_cached) {
3560  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3561  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3562  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3563  } else {
3564  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3565  __kmp_msg_null);
3566  }
3567  }
3568 
3569  /* find an available thread slot */
3570  /* Don't reassign the zero slot since we need that to only be used by initial
3571  thread */
3572  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3573  gtid++)
3574  ;
3575  KA_TRACE(1,
3576  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3577  KMP_ASSERT(gtid < __kmp_threads_capacity);
3578 
3579  /* update global accounting */
3580  __kmp_all_nth++;
3581  TCW_4(__kmp_nth, __kmp_nth + 1);
3582 
3583  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3584  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3585  if (__kmp_adjust_gtid_mode) {
3586  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3587  if (TCR_4(__kmp_gtid_mode) != 2) {
3588  TCW_4(__kmp_gtid_mode, 2);
3589  }
3590  } else {
3591  if (TCR_4(__kmp_gtid_mode) != 1) {
3592  TCW_4(__kmp_gtid_mode, 1);
3593  }
3594  }
3595  }
3596 
3597 #ifdef KMP_ADJUST_BLOCKTIME
3598  /* Adjust blocktime to zero if necessary */
3599  /* Middle initialization might not have occurred yet */
3600  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3601  if (__kmp_nth > __kmp_avail_proc) {
3602  __kmp_zero_bt = TRUE;
3603  }
3604  }
3605 #endif /* KMP_ADJUST_BLOCKTIME */
3606 
3607  /* setup this new hierarchy */
3608  if (!(root = __kmp_root[gtid])) {
3609  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3610  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3611  }
3612 
3613 #if KMP_STATS_ENABLED
3614  // Initialize stats as soon as possible (right after gtid assignment).
3615  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3616  __kmp_stats_thread_ptr->startLife();
3617  KMP_SET_THREAD_STATE(SERIAL_REGION);
3618  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3619 #endif
3620  __kmp_initialize_root(root);
3621 
3622  /* setup new root thread structure */
3623  if (root->r.r_uber_thread) {
3624  root_thread = root->r.r_uber_thread;
3625  } else {
3626  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3627  if (__kmp_storage_map) {
3628  __kmp_print_thread_storage_map(root_thread, gtid);
3629  }
3630  root_thread->th.th_info.ds.ds_gtid = gtid;
3631 #if OMPT_SUPPORT
3632  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3633 #endif
3634  root_thread->th.th_root = root;
3635  if (__kmp_env_consistency_check) {
3636  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3637  }
3638 #if USE_FAST_MEMORY
3639  __kmp_initialize_fast_memory(root_thread);
3640 #endif /* USE_FAST_MEMORY */
3641 
3642 #if KMP_USE_BGET
3643  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3644  __kmp_initialize_bget(root_thread);
3645 #endif
3646  __kmp_init_random(root_thread); // Initialize random number generator
3647  }
3648 
3649  /* setup the serial team held in reserve by the root thread */
3650  if (!root_thread->th.th_serial_team) {
3651  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3652  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3653  root_thread->th.th_serial_team = __kmp_allocate_team(
3654  root, 1, 1,
3655 #if OMPT_SUPPORT
3656  ompt_data_none, // root parallel id
3657 #endif
3658  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3659  }
3660  KMP_ASSERT(root_thread->th.th_serial_team);
3661  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3662  root_thread->th.th_serial_team));
3663 
3664  /* drop root_thread into place */
3665  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3666 
3667  root->r.r_root_team->t.t_threads[0] = root_thread;
3668  root->r.r_hot_team->t.t_threads[0] = root_thread;
3669  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3670  // AC: the team created in reserve, not for execution (it is unused for now).
3671  root_thread->th.th_serial_team->t.t_serialized = 0;
3672  root->r.r_uber_thread = root_thread;
3673 
3674  /* initialize the thread, get it ready to go */
3675  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3676  TCW_4(__kmp_init_gtid, TRUE);
3677 
3678  /* prepare the master thread for get_gtid() */
3679  __kmp_gtid_set_specific(gtid);
3680 
3681 #if USE_ITT_BUILD
3682  __kmp_itt_thread_name(gtid);
3683 #endif /* USE_ITT_BUILD */
3684 
3685 #ifdef KMP_TDATA_GTID
3686  __kmp_gtid = gtid;
3687 #endif
3688  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3689  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3690 
3691  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3692  "plain=%u\n",
3693  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3694  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3695  KMP_INIT_BARRIER_STATE));
3696  { // Initialize barrier data.
3697  int b;
3698  for (b = 0; b < bs_last_barrier; ++b) {
3699  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3700 #if USE_DEBUGGER
3701  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3702 #endif
3703  }
3704  }
3705  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3706  KMP_INIT_BARRIER_STATE);
3707 
3708 #if KMP_AFFINITY_SUPPORTED
3709  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3710  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3711  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3712  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3713  if (TCR_4(__kmp_init_middle)) {
3714  __kmp_affinity_set_init_mask(gtid, TRUE);
3715  }
3716 #endif /* KMP_AFFINITY_SUPPORTED */
3717  root_thread->th.th_def_allocator = __kmp_def_allocator;
3718  root_thread->th.th_prev_level = 0;
3719  root_thread->th.th_prev_num_threads = 1;
3720 
3721  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3722  tmp->cg_root = root_thread;
3723  tmp->cg_thread_limit = __kmp_cg_max_nth;
3724  tmp->cg_nthreads = 1;
3725  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3726  " cg_nthreads init to 1\n",
3727  root_thread, tmp));
3728  tmp->up = NULL;
3729  root_thread->th.th_cg_roots = tmp;
3730 
3731  __kmp_root_counter++;
3732 
3733 #if OMPT_SUPPORT
3734  if (!initial_thread && ompt_enabled.enabled) {
3735 
3736  kmp_info_t *root_thread = ompt_get_thread();
3737 
3738  ompt_set_thread_state(root_thread, ompt_state_overhead);
3739 
3740  if (ompt_enabled.ompt_callback_thread_begin) {
3741  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3742  ompt_thread_initial, __ompt_get_thread_data_internal());
3743  }
3744  ompt_data_t *task_data;
3745  ompt_data_t *parallel_data;
3746  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3747  if (ompt_enabled.ompt_callback_implicit_task) {
3748  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3749  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3750  }
3751 
3752  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3753  }
3754 #endif
3755 
3756  KMP_MB();
3757  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3758 
3759  return gtid;
3760 }
3761 
3762 #if KMP_NESTED_HOT_TEAMS
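// Recursively free the nested hot teams hanging off 'thr' at 'level' and
// below; returns the number of threads released (team masters themselves are
// not counted).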
3763 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3764  const int max_level) {
3765  int i, n, nth;
3766  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3767  if (!hot_teams || !hot_teams[level].hot_team) {
3768  return 0;
3769  }
3770  KMP_DEBUG_ASSERT(level < max_level);
3771  kmp_team_t *team = hot_teams[level].hot_team;
3772  nth = hot_teams[level].hot_team_nth;
3773  n = nth - 1; // master is not freed
3774  if (level < max_level - 1) {
3775  for (i = 0; i < nth; ++i) {
3776  kmp_info_t *th = team->t.t_threads[i];
3777  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3778  if (i > 0 && th->th.th_hot_teams) {
3779  __kmp_free(th->th.th_hot_teams);
3780  th->th.th_hot_teams = NULL;
3781  }
3782  }
3783  }
3784  __kmp_free_team(root, team, NULL);
3785  return n;
3786 }
3787 #endif
3788 
3789 // Resets a root thread and clears its root and hot teams.
3790 // Returns the number of __kmp_threads entries directly and indirectly freed.
3791 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3792  kmp_team_t *root_team = root->r.r_root_team;
3793  kmp_team_t *hot_team = root->r.r_hot_team;
3794  int n = hot_team->t.t_nproc;
3795  int i;
3796 
3797  KMP_DEBUG_ASSERT(!root->r.r_active);
3798 
3799  root->r.r_root_team = NULL;
3800  root->r.r_hot_team = NULL;
3801  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3802  // before call to __kmp_free_team().
3803  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3804 #if KMP_NESTED_HOT_TEAMS
3805  if (__kmp_hot_teams_max_level >
3806  0) { // need to free nested hot teams and their threads if any
3807  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3808  kmp_info_t *th = hot_team->t.t_threads[i];
3809  if (__kmp_hot_teams_max_level > 1) {
3810  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3811  }
3812  if (th->th.th_hot_teams) {
3813  __kmp_free(th->th.th_hot_teams);
3814  th->th.th_hot_teams = NULL;
3815  }
3816  }
3817  }
3818 #endif
3819  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3820 
3821  // Before we can reap the thread, we need to make certain that all other
3822  // threads in the teams that had this root as ancestor have stopped trying to
3823  // steal tasks.
3824  if (__kmp_tasking_mode != tskm_immediate_exec) {
3825  __kmp_wait_to_unref_task_teams();
3826  }
3827 
3828 #if KMP_OS_WINDOWS
3829  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3830  KA_TRACE(
3831  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3832  "\n",
3833  (LPVOID) & (root->r.r_uber_thread->th),
3834  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3835  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3836 #endif /* KMP_OS_WINDOWS */
3837 
3838 #if OMPT_SUPPORT
3839  ompt_data_t *task_data;
3840  ompt_data_t *parallel_data;
3841  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3842  if (ompt_enabled.ompt_callback_implicit_task) {
3843  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3844  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3845  }
3846  if (ompt_enabled.ompt_callback_thread_end) {
3847  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3848  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3849  }
3850 #endif
3851 
3852  TCW_4(__kmp_nth,
3853  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3854  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3855  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3856  " to %d\n",
3857  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3858  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3859  if (i == 1) {
3860  // need to free contention group structure
3861  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3862  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3863  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3864  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3865  root->r.r_uber_thread->th.th_cg_roots = NULL;
3866  }
3867  __kmp_reap_thread(root->r.r_uber_thread, 1);
3868 
3869  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3870  // it instead of freeing it.
3871  root->r.r_uber_thread = NULL;
3872  /* mark root as no longer in use */
3873  root->r.r_begin = FALSE;
3874 
3875  return n;
3876 }
3877 
3878 void __kmp_unregister_root_current_thread(int gtid) {
3879  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3880  /* This lock should be OK, since unregister_root_current_thread is never
3881  called during an abort, only during a normal close. Furthermore, if you
3882  have the forkjoin lock, you should never try to get the initz lock. */
3883  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3884  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3885  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3886  "exiting T#%d\n",
3887  gtid));
3888  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3889  return;
3890  }
3891  kmp_root_t *root = __kmp_root[gtid];
3892 
3893  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3894  KMP_ASSERT(KMP_UBER_GTID(gtid));
3895  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3896  KMP_ASSERT(root->r.r_active == FALSE);
3897 
3898  KMP_MB();
3899 
3900  kmp_info_t *thread = __kmp_threads[gtid];
3901  kmp_team_t *team = thread->th.th_team;
3902  kmp_task_team_t *task_team = thread->th.th_task_team;
3903 
3904  // we need to wait for the proxy tasks before finishing the thread
3905  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3906 #if OMPT_SUPPORT
3907  // the runtime is shutting down so we won't report any events
3908  thread->th.ompt_thread_info.state = ompt_state_undefined;
3909 #endif
3910  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3911  }
3912 
3913  __kmp_reset_root(gtid, root);
3914 
3915  /* free up this thread slot */
3916  __kmp_gtid_set_specific(KMP_GTID_DNE);
3917 #ifdef KMP_TDATA_GTID
3918  __kmp_gtid = KMP_GTID_DNE;
3919 #endif
3920 
3921  KMP_MB();
3922  KC_TRACE(10,
3923  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3924 
3925  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3926 }
3927 
3928 #if KMP_OS_WINDOWS
3929 /* __kmp_forkjoin_lock must already be held.
3930  Unregisters a root thread that is not the current thread. Returns the number
3931  of __kmp_threads entries freed as a result. */
3932 static int __kmp_unregister_root_other_thread(int gtid) {
3933  kmp_root_t *root = __kmp_root[gtid];
3934  int r;
3935 
3936  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3937  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3938  KMP_ASSERT(KMP_UBER_GTID(gtid));
3939  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3940  KMP_ASSERT(root->r.r_active == FALSE);
3941 
3942  r = __kmp_reset_root(gtid, root);
3943  KC_TRACE(10,
3944  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3945  return r;
3946 }
3947 #endif
3948 
3949 #if KMP_DEBUG
3950 void __kmp_task_info() {
3951 
3952  kmp_int32 gtid = __kmp_entry_gtid();
3953  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3954  kmp_info_t *this_thr = __kmp_threads[gtid];
3955  kmp_team_t *steam = this_thr->th.th_serial_team;
3956  kmp_team_t *team = this_thr->th.th_team;
3957 
3958  __kmp_printf(
3959  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
3960  "ptask=%p\n",
3961  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
3962  team->t.t_implicit_task_taskdata[tid].td_parent);
3963 }
3964 #endif // KMP_DEBUG
3965 
3966 /* TODO optimize with one big memclr, take out what isn't needed, split
3967  responsibility to workers as much as possible, and delay initialization of
3968  features as much as possible */
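// Bind this_thr to slot 'tid' of 'team': cache the team shape in the thread,
// and set up its implicit task, contention-group root, dynamic dispatch
// buffer and task-state stack.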
3969 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
3970  int tid, int gtid) {
3971  /* this_thr->th.th_info.ds.ds_gtid is setup in
3972  kmp_allocate_thread/create_worker.
3973  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3974  kmp_info_t *master = team->t.t_threads[0];
3975  KMP_DEBUG_ASSERT(this_thr != NULL);
3976  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
3977  KMP_DEBUG_ASSERT(team);
3978  KMP_DEBUG_ASSERT(team->t.t_threads);
3979  KMP_DEBUG_ASSERT(team->t.t_dispatch);
3980  KMP_DEBUG_ASSERT(master);
3981  KMP_DEBUG_ASSERT(master->th.th_root);
3982 
3983  KMP_MB();
3984 
3985  TCW_SYNC_PTR(this_thr->th.th_team, team);
3986 
3987  this_thr->th.th_info.ds.ds_tid = tid;
3988  this_thr->th.th_set_nproc = 0;
3989  if (__kmp_tasking_mode != tskm_immediate_exec)
3990  // When tasking is possible, threads are not safe to reap until they are
3991  // done tasking; this will be set when tasking code is exited in wait
3992  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3993  else // no tasking --> always safe to reap
3994  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
3995  this_thr->th.th_set_proc_bind = proc_bind_default;
3996 #if KMP_AFFINITY_SUPPORTED
3997  this_thr->th.th_new_place = this_thr->th.th_current_place;
3998 #endif
3999  this_thr->th.th_root = master->th.th_root;
4000 
4001  /* setup the thread's cache of the team structure */
4002  this_thr->th.th_team_nproc = team->t.t_nproc;
4003  this_thr->th.th_team_master = master;
4004  this_thr->th.th_team_serialized = team->t.t_serialized;
4005  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4006 
4007  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4008 
4009  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4010  tid, gtid, this_thr, this_thr->th.th_current_task));
4011 
4012  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4013  team, tid, TRUE);
4014 
4015  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4016  tid, gtid, this_thr, this_thr->th.th_current_task));
4017  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4018  // __kmp_initialize_team()?
4019 
4020  /* TODO no worksharing in speculative threads */
4021  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4022 
4023  this_thr->th.th_local.this_construct = 0;
4024 
4025  if (!this_thr->th.th_pri_common) {
4026  this_thr->th.th_pri_common =
4027  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4028  if (__kmp_storage_map) {
4029  __kmp_print_storage_map_gtid(
4030  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4031  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4032  }
4033  this_thr->th.th_pri_head = NULL;
4034  }
4035 
4036  if (this_thr != master && // Master's CG root is initialized elsewhere
4037  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4038  // Make new thread's CG root same as master's
4039  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4040  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4041  if (tmp) {
4042  // worker changes CG, need to check if old CG should be freed
4043  int i = tmp->cg_nthreads--;
4044  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4045  " on node %p of thread %p to %d\n",
4046  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4047  if (i == 1) {
4048  __kmp_free(tmp); // last thread left CG --> free it
4049  }
4050  }
4051  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4052  // Increment new thread's CG root's counter to add the new thread
4053  this_thr->th.th_cg_roots->cg_nthreads++;
4054  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4055  " node %p of thread %p to %d\n",
4056  this_thr, this_thr->th.th_cg_roots,
4057  this_thr->th.th_cg_roots->cg_root,
4058  this_thr->th.th_cg_roots->cg_nthreads));
4059  this_thr->th.th_current_task->td_icvs.thread_limit =
4060  this_thr->th.th_cg_roots->cg_thread_limit;
4061  }
4062 
4063  /* Initialize dynamic dispatch */
4064  {
4065  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4066  // Use team max_nproc since this will never change for the team.
4067  size_t disp_size =
4068  sizeof(dispatch_private_info_t) *
4069  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4070  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4071  team->t.t_max_nproc));
4072  KMP_ASSERT(dispatch);
4073  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4074  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4075 
4076  dispatch->th_disp_index = 0;
4077  dispatch->th_doacross_buf_idx = 0;
4078  if (!dispatch->th_disp_buffer) {
4079  dispatch->th_disp_buffer =
4080  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4081 
4082  if (__kmp_storage_map) {
4083  __kmp_print_storage_map_gtid(
4084  gtid, &dispatch->th_disp_buffer[0],
4085  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4086  ? 1
4087  : __kmp_dispatch_num_buffers],
4088  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4089  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4090  gtid, team->t.t_id, gtid);
4091  }
4092  } else {
4093  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4094  }
4095 
4096  dispatch->th_dispatch_pr_current = 0;
4097  dispatch->th_dispatch_sh_current = 0;
4098 
4099  dispatch->th_deo_fcn = 0; /* ORDERED */
4100  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4101  }
4102 
4103  this_thr->th.th_next_pool = NULL;
4104 
4105  if (!this_thr->th.th_task_state_memo_stack) {
4106  size_t i;
4107  this_thr->th.th_task_state_memo_stack =
4108  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4109  this_thr->th.th_task_state_top = 0;
4110  this_thr->th.th_task_state_stack_sz = 4;
4111  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4112  ++i) // zero init the stack
4113  this_thr->th.th_task_state_memo_stack[i] = 0;
4114  }
4115 
4116  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4117  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4118 
4119  KMP_MB();
4120 }
4121 
4122 /* Allocate a new thread for the requesting team. This is only called from
4123  within a forkjoin critical section. We will first try to get an available
4124  thread from the thread pool; if none is available, we will fork a new one,
4125  assuming we are able to create one. This should be assured, as the caller
4126  should have checked on this first. */
4127 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4128  int new_tid) {
4129  kmp_team_t *serial_team;
4130  kmp_info_t *new_thr;
4131  int new_gtid;
4132 
4133  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4134  KMP_DEBUG_ASSERT(root && team);
4135 #if !KMP_NESTED_HOT_TEAMS
4136  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4137 #endif
4138  KMP_MB();
4139 
4140  /* first, try to get one from the thread pool */
4141  if (__kmp_thread_pool) {
4142  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4143  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4144  if (new_thr == __kmp_thread_pool_insert_pt) {
4145  __kmp_thread_pool_insert_pt = NULL;
4146  }
4147  TCW_4(new_thr->th.th_in_pool, FALSE);
4148  __kmp_suspend_initialize_thread(new_thr);
4149  __kmp_lock_suspend_mx(new_thr);
4150  if (new_thr->th.th_active_in_pool == TRUE) {
4151  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4152  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4153  new_thr->th.th_active_in_pool = FALSE;
4154  }
4155  __kmp_unlock_suspend_mx(new_thr);
4156 
4157  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4158  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4159  KMP_ASSERT(!new_thr->th.th_team);
4160  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4161 
4162  /* setup the thread structure */
4163  __kmp_initialize_info(new_thr, team, new_tid,
4164  new_thr->th.th_info.ds.ds_gtid);
4165  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4166 
4167  TCW_4(__kmp_nth, __kmp_nth + 1);
4168 
4169  new_thr->th.th_task_state = 0;
4170  new_thr->th.th_task_state_top = 0;
4171  new_thr->th.th_task_state_stack_sz = 4;
4172 
4173 #ifdef KMP_ADJUST_BLOCKTIME
4174  /* Adjust blocktime back to zero if necessary */
4175  /* Middle initialization might not have occurred yet */
4176  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4177  if (__kmp_nth > __kmp_avail_proc) {
4178  __kmp_zero_bt = TRUE;
4179  }
4180  }
4181 #endif /* KMP_ADJUST_BLOCKTIME */
4182 
4183 #if KMP_DEBUG
4184  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4185  // KMP_BARRIER_PARENT_FLAG.
4186  int b;
4187  kmp_balign_t *balign = new_thr->th.th_bar;
4188  for (b = 0; b < bs_last_barrier; ++b)
4189  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4190 #endif
4191 
4192  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4193  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4194 
4195  KMP_MB();
4196  return new_thr;
4197  }
4198 
4199  /* no, we'll fork a new one */
4200  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4201  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4202 
4203 #if KMP_USE_MONITOR
4204  // If this is the first worker thread the RTL is creating, then also
4205  // launch the monitor thread. We try to do this as early as possible.
4206  if (!TCR_4(__kmp_init_monitor)) {
4207  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4208  if (!TCR_4(__kmp_init_monitor)) {
4209  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4210  TCW_4(__kmp_init_monitor, 1);
4211  __kmp_create_monitor(&__kmp_monitor);
4212  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4213 #if KMP_OS_WINDOWS
4214  // AC: wait until monitor has started. This is a fix for CQ232808.
4215  // The reason is that if the library is loaded/unloaded in a loop with
4216  // small (parallel) work in between, then there is a high probability that
4217  // the monitor thread starts after the library shutdown. At shutdown it is
4218  // too late to cope with the problem, because when the master is in
4219  // DllMain (process detach) the monitor has no chance to start (it is
4220  // blocked), and the master has no means to inform the monitor that the
4221  // library has gone, because all the memory which the monitor can access
4222  // is going to be released/reset.
4223  while (TCR_4(__kmp_init_monitor) < 2) {
4224  KMP_YIELD(TRUE);
4225  }
4226  KF_TRACE(10, ("after monitor thread has started\n"));
4227 #endif
4228  }
4229  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4230  }
4231 #endif
4232 
4233  KMP_MB();
4234  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4235  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4236  }
4237 
4238  /* allocate space for it. */
4239  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4240 
4241  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4242 
4243  if (__kmp_storage_map) {
4244  __kmp_print_thread_storage_map(new_thr, new_gtid);
4245  }
4246 
4247  // add the reserve serialized team, initialized from the team's master thread
4248  {
4249  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4250  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4251  new_thr->th.th_serial_team = serial_team =
4252  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4253 #if OMPT_SUPPORT
4254  ompt_data_none, // root parallel id
4255 #endif
4256  proc_bind_default, &r_icvs,
4257  0 USE_NESTED_HOT_ARG(NULL));
4258  }
4259  KMP_ASSERT(serial_team);
4260  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4261  // execution (it is unused for now).
4262  serial_team->t.t_threads[0] = new_thr;
4263  KF_TRACE(10,
4264  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4265  new_thr));
4266 
4267  /* setup the thread structures */
4268  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4269 
4270 #if USE_FAST_MEMORY
4271  __kmp_initialize_fast_memory(new_thr);
4272 #endif /* USE_FAST_MEMORY */
4273 
4274 #if KMP_USE_BGET
4275  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4276  __kmp_initialize_bget(new_thr);
4277 #endif
4278 
4279  __kmp_init_random(new_thr); // Initialize random number generator
4280 
4281  /* Initialize these only once when thread is grabbed for a team allocation */
4282  KA_TRACE(20,
4283  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4284  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4285 
4286  int b;
4287  kmp_balign_t *balign = new_thr->th.th_bar;
4288  for (b = 0; b < bs_last_barrier; ++b) {
4289  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4290  balign[b].bb.team = NULL;
4291  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4292  balign[b].bb.use_oncore_barrier = 0;
4293  }
4294 
4295  new_thr->th.th_spin_here = FALSE;
4296  new_thr->th.th_next_waiting = 0;
4297 #if KMP_OS_UNIX
4298  new_thr->th.th_blocking = false;
4299 #endif
4300 
4301 #if KMP_AFFINITY_SUPPORTED
4302  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4303  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4304  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4305  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4306 #endif
4307  new_thr->th.th_def_allocator = __kmp_def_allocator;
4308  new_thr->th.th_prev_level = 0;
4309  new_thr->th.th_prev_num_threads = 1;
4310 
4311  TCW_4(new_thr->th.th_in_pool, FALSE);
4312  new_thr->th.th_active_in_pool = FALSE;
4313  TCW_4(new_thr->th.th_active, TRUE);
4314 
4315  /* adjust the global counters */
4316  __kmp_all_nth++;
4317  __kmp_nth++;
4318 
4319  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4320  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4321  if (__kmp_adjust_gtid_mode) {
4322  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4323  if (TCR_4(__kmp_gtid_mode) != 2) {
4324  TCW_4(__kmp_gtid_mode, 2);
4325  }
4326  } else {
4327  if (TCR_4(__kmp_gtid_mode) != 1) {
4328  TCW_4(__kmp_gtid_mode, 1);
4329  }
4330  }
4331  }
4332 
4333 #ifdef KMP_ADJUST_BLOCKTIME
4334  /* Adjust blocktime back to zero if necessary */
4335  /* Middle initialization might not have occurred yet */
4336  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4337  if (__kmp_nth > __kmp_avail_proc) {
4338  __kmp_zero_bt = TRUE;
4339  }
4340  }
4341 #endif /* KMP_ADJUST_BLOCKTIME */
4342 
4343  /* actually fork it and create the new worker thread */
4344  KF_TRACE(
4345  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4346  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4347  KF_TRACE(10,
4348  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4349 
4350  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4351  new_gtid));
4352  KMP_MB();
4353  return new_thr;
4354 }
4355 
4356 /* Reinitialize team for reuse.
4357  The hot team code calls this routine at every fork barrier, so EPCC barrier
4358  tests are extremely sensitive to changes in it, especially writes to the
4359  team struct, which cause a cache invalidation in all threads.
4360  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4361 static void __kmp_reinitialize_team(kmp_team_t *team,
4362  kmp_internal_control_t *new_icvs,
4363  ident_t *loc) {
4364  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4365  team->t.t_threads[0], team));
4366  KMP_DEBUG_ASSERT(team && new_icvs);
4367  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4368  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4369 
4370  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4371  // Copy ICVs to the master thread's implicit taskdata
4372  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4373  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4374 
4375  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4376  team->t.t_threads[0], team));
4377 }
4378 
4379 /* Initialize the team data structure.
4380  This assumes the t_threads and t_max_nproc are already set.
4381  Also, we don't touch the arguments */
4382 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4383  kmp_internal_control_t *new_icvs,
4384  ident_t *loc) {
4385  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4386 
4387  /* verify */
4388  KMP_DEBUG_ASSERT(team);
4389  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4390  KMP_DEBUG_ASSERT(team->t.t_threads);
4391  KMP_MB();
4392 
4393  team->t.t_master_tid = 0; /* not needed */
4394  /* team->t.t_master_bar; not needed */
4395  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4396  team->t.t_nproc = new_nproc;
4397 
4398  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4399  team->t.t_next_pool = NULL;
4400  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4401  * up hot team */
4402 
4403  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4404  team->t.t_invoke = NULL; /* not needed */
4405 
4406  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4407  team->t.t_sched.sched = new_icvs->sched.sched;
4408 
4409 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4410  team->t.t_fp_control_saved = FALSE; /* not needed */
4411  team->t.t_x87_fpu_control_word = 0; /* not needed */
4412  team->t.t_mxcsr = 0; /* not needed */
4413 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4414 
4415  team->t.t_construct = 0;
4416 
4417  team->t.t_ordered.dt.t_value = 0;
4418  team->t.t_master_active = FALSE;
4419 
4420 #ifdef KMP_DEBUG
4421  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4422 #endif
4423 #if KMP_OS_WINDOWS
4424  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4425 #endif
4426 
4427  team->t.t_control_stack_top = NULL;
4428 
4429  __kmp_reinitialize_team(team, new_icvs, loc);
4430 
4431  KMP_MB();
4432  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4433 }
4434 
4435 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4436 /* Sets the full mask for the thread and saves the old mask via old_mask; no changes to structures. */
4437 static void
4438 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4439  if (KMP_AFFINITY_CAPABLE()) {
4440  int status;
4441  if (old_mask != NULL) {
4442  status = __kmp_get_system_affinity(old_mask, TRUE);
4443  int error = errno;
4444  if (status != 0) {
4445  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4446  __kmp_msg_null);
4447  }
4448  }
4449  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4450  }
4451 }
4452 #endif
4453 
4454 #if KMP_AFFINITY_SUPPORTED
4455 
4456 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4457 // It calculates the worker + master thread's partition based upon the parent
4458 // thread's partition, and binds each worker to a place in its partition.
4459 // The master thread's partition should already include its current binding.
4460 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4461  // Copy the master thread's place partition to the team struct
4462  kmp_info_t *master_th = team->t.t_threads[0];
4463  KMP_DEBUG_ASSERT(master_th != NULL);
4464  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4465  int first_place = master_th->th.th_first_place;
4466  int last_place = master_th->th.th_last_place;
4467  int masters_place = master_th->th.th_current_place;
4468  team->t.t_first_place = first_place;
4469  team->t.t_last_place = last_place;
4470 
4471  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4472  "bound to place %d partition = [%d,%d]\n",
4473  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4474  team->t.t_id, masters_place, first_place, last_place));
4475 
4476  switch (proc_bind) {
4477 
4478  case proc_bind_default:
4479  // Serial teams might have the proc_bind policy set to proc_bind_default. It
4480  // doesn't matter, as we don't rebind the master thread for any proc_bind policy.
4481  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4482  break;
4483 
4484  case proc_bind_master: {
4485  int f;
4486  int n_th = team->t.t_nproc;
4487  for (f = 1; f < n_th; f++) {
4488  kmp_info_t *th = team->t.t_threads[f];
4489  KMP_DEBUG_ASSERT(th != NULL);
4490  th->th.th_first_place = first_place;
4491  th->th.th_last_place = last_place;
4492  th->th.th_new_place = masters_place;
4493  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4494  team->t.t_display_affinity != 1) {
4495  team->t.t_display_affinity = 1;
4496  }
4497 
4498  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4499  "partition = [%d,%d]\n",
4500  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4501  f, masters_place, first_place, last_place));
4502  }
4503  } break;
4504 
4505  case proc_bind_close: {
4506  int f;
4507  int n_th = team->t.t_nproc;
4508  int n_places;
4509  if (first_place <= last_place) {
4510  n_places = last_place - first_place + 1;
4511  } else {
4512  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4513  }
4514  if (n_th <= n_places) {
4515  int place = masters_place;
4516  for (f = 1; f < n_th; f++) {
4517  kmp_info_t *th = team->t.t_threads[f];
4518  KMP_DEBUG_ASSERT(th != NULL);
4519 
4520  if (place == last_place) {
4521  place = first_place;
4522  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4523  place = 0;
4524  } else {
4525  place++;
4526  }
4527  th->th.th_first_place = first_place;
4528  th->th.th_last_place = last_place;
4529  th->th.th_new_place = place;
4530  if (__kmp_display_affinity && place != th->th.th_current_place &&
4531  team->t.t_display_affinity != 1) {
4532  team->t.t_display_affinity = 1;
4533  }
4534 
4535  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4536  "partition = [%d,%d]\n",
4537  __kmp_gtid_from_thread(team->t.t_threads[f]),
4538  team->t.t_id, f, place, first_place, last_place));
4539  }
4540  } else {
4541  int S, rem, gap, s_count;
4542  S = n_th / n_places;
4543  s_count = 0;
4544  rem = n_th - (S * n_places);
4545  gap = rem > 0 ? n_places / rem : n_places;
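  // More threads than places: distribute S = n_th / n_places threads per
  // place, with the rem leftover threads spaced 'gap' places apart. For
  // example, n_th = 10 over n_places = 4 gives S = 2, rem = 2, gap = 2, so
  // the places receive 3, 2, 3, 2 threads respectively.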
4546  int place = masters_place;
4547  int gap_ct = gap;
4548  for (f = 0; f < n_th; f++) {
4549  kmp_info_t *th = team->t.t_threads[f];
4550  KMP_DEBUG_ASSERT(th != NULL);
4551 
4552  th->th.th_first_place = first_place;
4553  th->th.th_last_place = last_place;
4554  th->th.th_new_place = place;
4555  if (__kmp_display_affinity && place != th->th.th_current_place &&
4556  team->t.t_display_affinity != 1) {
4557  team->t.t_display_affinity = 1;
4558  }
4559  s_count++;
4560 
4561  if ((s_count == S) && rem && (gap_ct == gap)) {
4562  // do nothing, add an extra thread to place on next iteration
4563  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4564  // we added an extra thread to this place; move to next place
4565  if (place == last_place) {
4566  place = first_place;
4567  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4568  place = 0;
4569  } else {
4570  place++;
4571  }
4572  s_count = 0;
4573  gap_ct = 1;
4574  rem--;
4575  } else if (s_count == S) { // place full; don't add extra
4576  if (place == last_place) {
4577  place = first_place;
4578  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4579  place = 0;
4580  } else {
4581  place++;
4582  }
4583  gap_ct++;
4584  s_count = 0;
4585  }
4586 
4587  KA_TRACE(100,
4588  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4589  "partition = [%d,%d]\n",
4590  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4591  th->th.th_new_place, first_place, last_place));
4592  }
4593  KMP_DEBUG_ASSERT(place == masters_place);
4594  }
4595  } break;
4596 
4597  case proc_bind_spread: {
4598  int f;
4599  int n_th = team->t.t_nproc;
4600  int n_places;
4601  int thidx;
4602  if (first_place <= last_place) {
4603  n_places = last_place - first_place + 1;
4604  } else {
4605  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4606  }
4607  if (n_th <= n_places) {
4608  int place = -1;
4609 
4610  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4611  int S = n_places / n_th;
4612  int s_count, rem, gap, gap_ct;
4613 
4614  place = masters_place;
4615  rem = n_places - n_th * S;
4616  gap = rem ? n_th / rem : 1;
4617  gap_ct = gap;
4618  thidx = n_th;
4619  if (update_master_only == 1)
4620  thidx = 1;
4621  for (f = 0; f < thidx; f++) {
4622  kmp_info_t *th = team->t.t_threads[f];
4623  KMP_DEBUG_ASSERT(th != NULL);
4624 
4625  th->th.th_first_place = place;
4626  th->th.th_new_place = place;
4627  if (__kmp_display_affinity && place != th->th.th_current_place &&
4628  team->t.t_display_affinity != 1) {
4629  team->t.t_display_affinity = 1;
4630  }
4631  s_count = 1;
4632  while (s_count < S) {
4633  if (place == last_place) {
4634  place = first_place;
4635  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4636  place = 0;
4637  } else {
4638  place++;
4639  }
4640  s_count++;
4641  }
4642  if (rem && (gap_ct == gap)) {
4643  if (place == last_place) {
4644  place = first_place;
4645  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4646  place = 0;
4647  } else {
4648  place++;
4649  }
4650  rem--;
4651  gap_ct = 0;
4652  }
4653  th->th.th_last_place = place;
4654  gap_ct++;
4655 
4656  if (place == last_place) {
4657  place = first_place;
4658  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4659  place = 0;
4660  } else {
4661  place++;
4662  }
4663 
4664  KA_TRACE(100,
4665  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4666  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4667  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4668  f, th->th.th_new_place, th->th.th_first_place,
4669  th->th.th_last_place, __kmp_affinity_num_masks));
4670  }
4671  } else {
4672  /* Having a uniform space of available computation places, we can create
4673  T partitions of round(P/T) places each and put threads into the first
4674  place of each partition. */
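      /* Illustrative sketch of the spacing computation: with masters_place = 0,
         n_places = 8 and n_th = 3, spacing = (8 + 1) / 3 = 3.0, so the threads
         get partitions [0,2], [3,5] and [6,7] (the last one clipped to
         n_places - 1), each bound to the first place of its partition. */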
4675  double current = static_cast<double>(masters_place);
4676  double spacing =
4677  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4678  int first, last;
4679  kmp_info_t *th;
4680 
4681  thidx = n_th + 1;
4682  if (update_master_only == 1)
4683  thidx = 1;
4684  for (f = 0; f < thidx; f++) {
4685  first = static_cast<int>(current);
4686  last = static_cast<int>(current + spacing) - 1;
4687  KMP_DEBUG_ASSERT(last >= first);
4688  if (first >= n_places) {
4689  if (masters_place) {
4690  first -= n_places;
4691  last -= n_places;
4692  if (first == (masters_place + 1)) {
4693  KMP_DEBUG_ASSERT(f == n_th);
4694  first--;
4695  }
4696  if (last == masters_place) {
4697  KMP_DEBUG_ASSERT(f == (n_th - 1));
4698  last--;
4699  }
4700  } else {
4701  KMP_DEBUG_ASSERT(f == n_th);
4702  first = 0;
4703  last = 0;
4704  }
4705  }
4706  if (last >= n_places) {
4707  last = (n_places - 1);
4708  }
4709  place = first;
4710  current += spacing;
4711  if (f < n_th) {
4712  KMP_DEBUG_ASSERT(0 <= first);
4713  KMP_DEBUG_ASSERT(n_places > first);
4714  KMP_DEBUG_ASSERT(0 <= last);
4715  KMP_DEBUG_ASSERT(n_places > last);
4716  KMP_DEBUG_ASSERT(last_place >= first_place);
4717  th = team->t.t_threads[f];
4718  KMP_DEBUG_ASSERT(th);
4719  th->th.th_first_place = first;
4720  th->th.th_new_place = place;
4721  th->th.th_last_place = last;
4722  if (__kmp_display_affinity && place != th->th.th_current_place &&
4723  team->t.t_display_affinity != 1) {
4724  team->t.t_display_affinity = 1;
4725  }
4726  KA_TRACE(100,
4727  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4728  "partition = [%d,%d], spacing = %.4f\n",
4729  __kmp_gtid_from_thread(team->t.t_threads[f]),
4730  team->t.t_id, f, th->th.th_new_place,
4731  th->th.th_first_place, th->th.th_last_place, spacing));
4732  }
4733  }
4734  }
4735  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4736  } else {
4737  int S, rem, gap, s_count;
4738  S = n_th / n_places;
4739  s_count = 0;
4740  rem = n_th - (S * n_places);
4741  gap = rem > 0 ? n_places / rem : n_places;
4742  int place = masters_place;
4743  int gap_ct = gap;
4744  thidx = n_th;
4745  if (update_master_only == 1)
4746  thidx = 1;
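      // Same block distribution as the overloaded proc_bind_close case above
      // (S threads per place, plus one extra on every gap-th place while
      // rem > 0), except that each thread's partition is narrowed to the
      // single place it is bound to.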
4747  for (f = 0; f < thidx; f++) {
4748  kmp_info_t *th = team->t.t_threads[f];
4749  KMP_DEBUG_ASSERT(th != NULL);
4750 
4751  th->th.th_first_place = place;
4752  th->th.th_last_place = place;
4753  th->th.th_new_place = place;
4754  if (__kmp_display_affinity && place != th->th.th_current_place &&
4755  team->t.t_display_affinity != 1) {
4756  team->t.t_display_affinity = 1;
4757  }
4758  s_count++;
4759 
4760  if ((s_count == S) && rem && (gap_ct == gap)) {
4761  // do nothing, add an extra thread to place on next iteration
4762  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4763  // we added an extra thread to this place; move on to next place
4764  if (place == last_place) {
4765  place = first_place;
4766  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4767  place = 0;
4768  } else {
4769  place++;
4770  }
4771  s_count = 0;
4772  gap_ct = 1;
4773  rem--;
4774  } else if (s_count == S) { // place is full; don't add extra thread
4775  if (place == last_place) {
4776  place = first_place;
4777  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4778  place = 0;
4779  } else {
4780  place++;
4781  }
4782  gap_ct++;
4783  s_count = 0;
4784  }
4785 
4786  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4787  "partition = [%d,%d]\n",
4788  __kmp_gtid_from_thread(team->t.t_threads[f]),
4789  team->t.t_id, f, th->th.th_new_place,
4790  th->th.th_first_place, th->th.th_last_place));
4791  }
4792  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4793  }
4794  } break;
4795 
4796  default:
4797  break;
4798  }
4799 
4800  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4801 }
4802 
4803 #endif // KMP_AFFINITY_SUPPORTED
4804 
4805 /* allocate a new team data structure to use. take one off of the free pool if
4806  available */
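/* The allocation proceeds in three stages: reuse the (possibly nested) hot
   team when allowed, otherwise take a sufficiently large team from
   __kmp_team_pool, and only as a last resort allocate and initialize a brand
   new kmp_team_t. */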
4807 kmp_team_t *
4808 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4809 #if OMPT_SUPPORT
4810  ompt_data_t ompt_parallel_data,
4811 #endif
4812  kmp_proc_bind_t new_proc_bind,
4813  kmp_internal_control_t *new_icvs,
4814  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4815  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4816  int f;
4817  kmp_team_t *team;
4818  int use_hot_team = !root->r.r_active;
4819  int level = 0;
4820 
4821  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4822  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4823  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4824  KMP_MB();
4825 
4826 #if KMP_NESTED_HOT_TEAMS
4827  kmp_hot_team_ptr_t *hot_teams;
4828  if (master) {
4829  team = master->th.th_team;
4830  level = team->t.t_active_level;
4831  if (master->th.th_teams_microtask) { // in teams construct?
4832  if (master->th.th_teams_size.nteams > 1 &&
4833  ( // #teams > 1
4834  team->t.t_pkfn ==
4835  (microtask_t)__kmp_teams_master || // inner fork of the teams
4836  master->th.th_teams_level <
4837  team->t.t_level)) { // or nested parallel inside the teams
4838  ++level; // don't increment if #teams==1 or for the outer fork of the
4839  // teams; increment otherwise
4840  }
4841  }
4842  hot_teams = master->th.th_hot_teams;
4843  if (level < __kmp_hot_teams_max_level && hot_teams &&
4844  hot_teams[level]
4845  .hot_team) { // hot team has already been allocated for given level
4846  use_hot_team = 1;
4847  } else {
4848  use_hot_team = 0;
4849  }
4850  }
4851 #endif
4852  // Optimization to use a "hot" team
4853  if (use_hot_team && new_nproc > 1) {
4854  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4855 #if KMP_NESTED_HOT_TEAMS
4856  team = hot_teams[level].hot_team;
4857 #else
4858  team = root->r.r_hot_team;
4859 #endif
4860 #if KMP_DEBUG
4861  if (__kmp_tasking_mode != tskm_immediate_exec) {
4862  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4863  "task_team[1] = %p before reinit\n",
4864  team->t.t_task_team[0], team->t.t_task_team[1]));
4865  }
4866 #endif
4867 
4868  // Has the number of threads changed?
4869  /* Let's assume the most common case is that the number of threads is
4870  unchanged, and put that case first. */
4871  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4872  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4873  // This case can mean that omp_set_num_threads() was called and the hot
4874  // team size was already reduced, so we check the special flag
4875  if (team->t.t_size_changed == -1) {
4876  team->t.t_size_changed = 1;
4877  } else {
4878  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4879  }
4880 
4881  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4882  kmp_r_sched_t new_sched = new_icvs->sched;
4883  // set master's schedule as new run-time schedule
4884  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4885 
4886  __kmp_reinitialize_team(team, new_icvs,
4887  root->r.r_uber_thread->th.th_ident);
4888 
4889  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4890  team->t.t_threads[0], team));
4891  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4892 
4893 #if KMP_AFFINITY_SUPPORTED
4894  if ((team->t.t_size_changed == 0) &&
4895  (team->t.t_proc_bind == new_proc_bind)) {
4896  if (new_proc_bind == proc_bind_spread) {
4897  __kmp_partition_places(
4898  team, 1); // add flag to update only master for spread
4899  }
4900  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4901  "proc_bind = %d, partition = [%d,%d]\n",
4902  team->t.t_id, new_proc_bind, team->t.t_first_place,
4903  team->t.t_last_place));
4904  } else {
4905  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4906  __kmp_partition_places(team);
4907  }
4908 #else
4909  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4910 #endif /* KMP_AFFINITY_SUPPORTED */
4911  } else if (team->t.t_nproc > new_nproc) {
4912  KA_TRACE(20,
4913  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4914  new_nproc));
4915 
4916  team->t.t_size_changed = 1;
4917 #if KMP_NESTED_HOT_TEAMS
4918  if (__kmp_hot_teams_mode == 0) {
4919  // AC: the saved number of threads should correspond to the team's value in
4920  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
4921  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4922  hot_teams[level].hot_team_nth = new_nproc;
4923 #endif // KMP_NESTED_HOT_TEAMS
4924  /* release the extra threads we don't need any more */
4925  for (f = new_nproc; f < team->t.t_nproc; f++) {
4926  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4927  if (__kmp_tasking_mode != tskm_immediate_exec) {
4928  // When decreasing team size, threads no longer in the team should
4929  // unref task team.
4930  team->t.t_threads[f]->th.th_task_team = NULL;
4931  }
4932  __kmp_free_thread(team->t.t_threads[f]);
4933  team->t.t_threads[f] = NULL;
4934  }
4935 #if KMP_NESTED_HOT_TEAMS
4936  } // (__kmp_hot_teams_mode == 0)
4937  else {
4938  // When keeping extra threads in team, switch threads to wait on own
4939  // b_go flag
4940  for (f = new_nproc; f < team->t.t_nproc; ++f) {
4941  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4942  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4943  for (int b = 0; b < bs_last_barrier; ++b) {
4944  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4945  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4946  }
4947  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4948  }
4949  }
4950  }
4951 #endif // KMP_NESTED_HOT_TEAMS
4952  team->t.t_nproc = new_nproc;
4953  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4954  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4955  __kmp_reinitialize_team(team, new_icvs,
4956  root->r.r_uber_thread->th.th_ident);
4957 
4958  // Update remaining threads
4959  for (f = 0; f < new_nproc; ++f) {
4960  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4961  }
4962 
4963  // restore the current task state of the master thread: should be the
4964  // implicit task
4965  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4966  team->t.t_threads[0], team));
4967 
4968  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4969 
4970 #ifdef KMP_DEBUG
4971  for (f = 0; f < team->t.t_nproc; f++) {
4972  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4973  team->t.t_threads[f]->th.th_team_nproc ==
4974  team->t.t_nproc);
4975  }
4976 #endif
4977 
4978  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4979 #if KMP_AFFINITY_SUPPORTED
4980  __kmp_partition_places(team);
4981 #endif
4982  } else { // team->t.t_nproc < new_nproc
4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984  kmp_affin_mask_t *old_mask;
4985  if (KMP_AFFINITY_CAPABLE()) {
4986  KMP_CPU_ALLOC(old_mask);
4987  }
4988 #endif
4989 
4990  KA_TRACE(20,
4991  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4992  new_nproc));
4993 
4994  team->t.t_size_changed = 1;
4995 
4996 #if KMP_NESTED_HOT_TEAMS
4997  int avail_threads = hot_teams[level].hot_team_nth;
4998  if (new_nproc < avail_threads)
4999  avail_threads = new_nproc;
5000  kmp_info_t **other_threads = team->t.t_threads;
5001  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002  // Adjust barrier data of reserved threads (if any) of the team
5003  // Other data will be set in __kmp_initialize_info() below.
5004  int b;
5005  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006  for (b = 0; b < bs_last_barrier; ++b) {
5007  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009 #if USE_DEBUGGER
5010  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5011 #endif
5012  }
5013  }
5014  if (hot_teams[level].hot_team_nth >= new_nproc) {
5015  // we have all needed threads in reserve, no need to allocate any
5016  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5017  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018  team->t.t_nproc = new_nproc; // just get reserved threads involved
5019  } else {
5020  // we may have some threads in reserve, but not enough
5021  team->t.t_nproc =
5022  hot_teams[level]
5023  .hot_team_nth; // get reserved threads involved if any
5024  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025 #endif // KMP_NESTED_HOT_TEAMS
5026  if (team->t.t_max_nproc < new_nproc) {
5027  /* reallocate larger arrays */
5028  __kmp_reallocate_team_arrays(team, new_nproc);
5029  __kmp_reinitialize_team(team, new_icvs, NULL);
5030  }
5031 
5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5033  /* Temporarily set full mask for master thread before creation of
5034  workers. The reason is that workers inherit the affinity from the master,
5035  so if a lot of workers are created on a single core quickly, they
5036  don't get a chance to set their own affinity for a long time. */
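      /* The mask saved in old_mask is restored after all new workers have been
         created; see the KMP_AFFINITY_CAPABLE() block further below. */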
5037  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5038 #endif
5039 
5040  /* allocate new threads for the hot team */
5041  for (f = team->t.t_nproc; f < new_nproc; f++) {
5042  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043  KMP_DEBUG_ASSERT(new_worker);
5044  team->t.t_threads[f] = new_worker;
5045 
5046  KA_TRACE(20,
5047  ("__kmp_allocate_team: team %d init T#%d arrived: "
5048  "join=%llu, plain=%llu\n",
5049  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5050  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5051  team->t.t_bar[bs_plain_barrier].b_arrived));
5052 
5053  { // Initialize barrier data for new threads.
5054  int b;
5055  kmp_balign_t *balign = new_worker->th.th_bar;
5056  for (b = 0; b < bs_last_barrier; ++b) {
5057  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059  KMP_BARRIER_PARENT_FLAG);
5060 #if USE_DEBUGGER
5061  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5062 #endif
5063  }
5064  }
5065  }
5066 
5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068  if (KMP_AFFINITY_CAPABLE()) {
5069  /* Restore initial master thread's affinity mask */
5070  __kmp_set_system_affinity(old_mask, TRUE);
5071  KMP_CPU_FREE(old_mask);
5072  }
5073 #endif
5074 #if KMP_NESTED_HOT_TEAMS
5075  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076 #endif // KMP_NESTED_HOT_TEAMS
5077  /* make sure everyone is synchronized */
5078  int old_nproc = team->t.t_nproc; // save old value and use to update only
5079  // new threads below
5080  __kmp_initialize_team(team, new_nproc, new_icvs,
5081  root->r.r_uber_thread->th.th_ident);
5082 
5083  /* reinitialize the threads */
5084  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085  for (f = 0; f < team->t.t_nproc; ++f)
5086  __kmp_initialize_info(team->t.t_threads[f], team, f,
5087  __kmp_gtid_from_tid(f, team));
5088 
5089  if (level) { // set th_task_state for new threads in nested hot team
5090  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5091  // only need to set the th_task_state for the new threads. th_task_state
5092  // for master thread will not be accurate until after this in
5093  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5094  // correct value.
5095  for (f = old_nproc; f < team->t.t_nproc; ++f)
5096  team->t.t_threads[f]->th.th_task_state =
5097  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5098  } else { // set th_task_state for new threads in non-nested hot team
5099  int old_state =
5100  team->t.t_threads[0]->th.th_task_state; // copy master's state
5101  for (f = old_nproc; f < team->t.t_nproc; ++f)
5102  team->t.t_threads[f]->th.th_task_state = old_state;
5103  }
5104 
5105 #ifdef KMP_DEBUG
5106  for (f = 0; f < team->t.t_nproc; ++f) {
5107  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5108  team->t.t_threads[f]->th.th_team_nproc ==
5109  team->t.t_nproc);
5110  }
5111 #endif
5112 
5113  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114 #if KMP_AFFINITY_SUPPORTED
5115  __kmp_partition_places(team);
5116 #endif
5117  } // Check changes in number of threads
5118 
5119  kmp_info_t *master = team->t.t_threads[0];
5120  if (master->th.th_teams_microtask) {
5121  for (f = 1; f < new_nproc; ++f) {
5122  // propagate teams construct specific info to workers
5123  kmp_info_t *thr = team->t.t_threads[f];
5124  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5125  thr->th.th_teams_level = master->th.th_teams_level;
5126  thr->th.th_teams_size = master->th.th_teams_size;
5127  }
5128  }
5129 #if KMP_NESTED_HOT_TEAMS
5130  if (level) {
5131  // Sync barrier state for nested hot teams, not needed for outermost hot
5132  // team.
5133  for (f = 1; f < new_nproc; ++f) {
5134  kmp_info_t *thr = team->t.t_threads[f];
5135  int b;
5136  kmp_balign_t *balign = thr->th.th_bar;
5137  for (b = 0; b < bs_last_barrier; ++b) {
5138  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5139  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5140 #if USE_DEBUGGER
5141  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5142 #endif
5143  }
5144  }
5145  }
5146 #endif // KMP_NESTED_HOT_TEAMS
5147 
5148  /* reallocate space for arguments if necessary */
5149  __kmp_alloc_argv_entries(argc, team, TRUE);
5150  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5151  // The hot team re-uses the previous task team,
5152  // if untouched during the previous release->gather phase.
5153 
5154  KF_TRACE(10, (" hot_team = %p\n", team));
5155 
5156 #if KMP_DEBUG
5157  if (__kmp_tasking_mode != tskm_immediate_exec) {
5158  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5159  "task_team[1] = %p after reinit\n",
5160  team->t.t_task_team[0], team->t.t_task_team[1]));
5161  }
5162 #endif
5163 
5164 #if OMPT_SUPPORT
5165  __ompt_team_assign_id(team, ompt_parallel_data);
5166 #endif
5167 
5168  KMP_MB();
5169 
5170  return team;
5171  }
5172 
5173  /* next, let's try to take one from the team pool */
5174  KMP_MB();
5175  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5176  /* TODO: consider resizing undersized teams instead of reaping them, now
5177  that we have a resizing mechanism */
5178  if (team->t.t_max_nproc >= max_nproc) {
5179  /* take this team from the team pool */
5180  __kmp_team_pool = team->t.t_next_pool;
5181 
5182  /* setup the team for fresh use */
5183  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5184 
5185  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5186  "task_team[1] %p to NULL\n",
5187  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5188  team->t.t_task_team[0] = NULL;
5189  team->t.t_task_team[1] = NULL;
5190 
5191  /* reallocate space for arguments if necessary */
5192  __kmp_alloc_argv_entries(argc, team, TRUE);
5193  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5194 
5195  KA_TRACE(
5196  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5197  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5198  { // Initialize barrier data.
5199  int b;
5200  for (b = 0; b < bs_last_barrier; ++b) {
5201  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5202 #if USE_DEBUGGER
5203  team->t.t_bar[b].b_master_arrived = 0;
5204  team->t.t_bar[b].b_team_arrived = 0;
5205 #endif
5206  }
5207  }
5208 
5209  team->t.t_proc_bind = new_proc_bind;
5210 
5211  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5212  team->t.t_id));
5213 
5214 #if OMPT_SUPPORT
5215  __ompt_team_assign_id(team, ompt_parallel_data);
5216 #endif
5217 
5218  KMP_MB();
5219 
5220  return team;
5221  }
5222 
5223  /* reap team if it is too small, then loop back and check the next one */
5224  // not sure if this is wise, but it will be redone during the hot-teams
5225  // rewrite.
5226  /* TODO: Use technique to find the right size hot-team, don't reap them */
5227  team = __kmp_reap_team(team);
5228  __kmp_team_pool = team;
5229  }
5230 
5231  /* nothing available in the pool, no matter, make a new team! */
5232  KMP_MB();
5233  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5234 
5235  /* and set it up */
5236  team->t.t_max_nproc = max_nproc;
5237  /* NOTE well, for some reason allocating one big buffer and dividing it up
5238  seems to really hurt performance a lot on the P4, so let's not use this */
5239  __kmp_allocate_team_arrays(team, max_nproc);
5240 
5241  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5242  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5243 
5244  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5245  "%p to NULL\n",
5246  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5247  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5248  // memory, no need to duplicate
5249  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5250  // memory, no need to duplicate
5251 
5252  if (__kmp_storage_map) {
5253  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5254  }
5255 
5256  /* allocate space for arguments */
5257  __kmp_alloc_argv_entries(argc, team, FALSE);
5258  team->t.t_argc = argc;
5259 
5260  KA_TRACE(20,
5261  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5262  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5263  { // Initialize barrier data.
5264  int b;
5265  for (b = 0; b < bs_last_barrier; ++b) {
5266  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5267 #if USE_DEBUGGER
5268  team->t.t_bar[b].b_master_arrived = 0;
5269  team->t.t_bar[b].b_team_arrived = 0;
5270 #endif
5271  }
5272  }
5273 
5274  team->t.t_proc_bind = new_proc_bind;
5275 
5276 #if OMPT_SUPPORT
5277  __ompt_team_assign_id(team, ompt_parallel_data);
5278  team->t.ompt_serialized_team_info = NULL;
5279 #endif
5280 
5281  KMP_MB();
5282 
5283  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5284  team->t.t_id));
5285 
5286  return team;
5287 }
5288 
5289 /* TODO implement hot-teams at all levels */
5290 /* TODO implement lazy thread release on demand (disband request) */
5291 
5292 /* free the team. return it to the team pool. release all the threads
5293  * associated with it */
5294 void __kmp_free_team(kmp_root_t *root,
5295  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5296  int f;
5297  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5298  team->t.t_id));
5299 
5300  /* verify state */
5301  KMP_DEBUG_ASSERT(root);
5302  KMP_DEBUG_ASSERT(team);
5303  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5304  KMP_DEBUG_ASSERT(team->t.t_threads);
5305 
5306  int use_hot_team = team == root->r.r_hot_team;
5307 #if KMP_NESTED_HOT_TEAMS
5308  int level;
5309  kmp_hot_team_ptr_t *hot_teams;
5310  if (master) {
5311  level = team->t.t_active_level - 1;
5312  if (master->th.th_teams_microtask) { // in teams construct?
5313  if (master->th.th_teams_size.nteams > 1) {
5314  ++level; // level was not increased in teams construct for
5315  // team_of_masters
5316  }
5317  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5318  master->th.th_teams_level == team->t.t_level) {
5319  ++level; // level was not increased in teams construct for
5320  // team_of_workers before the parallel
5321  } // team->t.t_level will be increased inside parallel
5322  }
5323  hot_teams = master->th.th_hot_teams;
5324  if (level < __kmp_hot_teams_max_level) {
5325  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5326  use_hot_team = 1;
5327  }
5328  }
5329 #endif // KMP_NESTED_HOT_TEAMS
5330 
5331  /* team is done working */
5332  TCW_SYNC_PTR(team->t.t_pkfn,
5333  NULL); // Important for Debugging Support Library.
5334 #if KMP_OS_WINDOWS
5335  team->t.t_copyin_counter = 0; // init counter for possible reuse
5336 #endif
5337  // Do not reset pointer to parent team to NULL for hot teams.
5338 
5339  /* if we are non-hot team, release our threads */
5340  if (!use_hot_team) {
5341  if (__kmp_tasking_mode != tskm_immediate_exec) {
5342  // Wait for threads to reach reapable state
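      // A thread is reapable once it has set th_reap_state to KMP_SAFE_TO_REAP;
      // if it is still sleeping on its fork/join b_go flag, resume it below so
      // it can reach that state.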
5343  for (f = 1; f < team->t.t_nproc; ++f) {
5344  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5345  kmp_info_t *th = team->t.t_threads[f];
5346  volatile kmp_uint32 *state = &th->th.th_reap_state;
5347  while (*state != KMP_SAFE_TO_REAP) {
5348 #if KMP_OS_WINDOWS
5349  // On Windows a thread can be killed at any time, check this
5350  DWORD ecode;
5351  if (!__kmp_is_thread_alive(th, &ecode)) {
5352  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5353  break;
5354  }
5355 #endif
5356  // first check if thread is sleeping
5357  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5358  if (fl.is_sleeping())
5359  fl.resume(__kmp_gtid_from_thread(th));
5360  KMP_CPU_PAUSE();
5361  }
5362  }
5363 
5364  // Delete task teams
5365  int tt_idx;
5366  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5367  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5368  if (task_team != NULL) {
5369  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5370  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5371  team->t.t_threads[f]->th.th_task_team = NULL;
5372  }
5373  KA_TRACE(
5374  20,
5375  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5376  __kmp_get_gtid(), task_team, team->t.t_id));
5377 #if KMP_NESTED_HOT_TEAMS
5378  __kmp_free_task_team(master, task_team);
5379 #endif
5380  team->t.t_task_team[tt_idx] = NULL;
5381  }
5382  }
5383  }
5384 
5385  // Reset pointer to parent team only for non-hot teams.
5386  team->t.t_parent = NULL;
5387  team->t.t_level = 0;
5388  team->t.t_active_level = 0;
5389 
5390  /* free the worker threads */
5391  for (f = 1; f < team->t.t_nproc; ++f) {
5392  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5393  __kmp_free_thread(team->t.t_threads[f]);
5394  team->t.t_threads[f] = NULL;
5395  }
5396 
5397  /* put the team back in the team pool */
5398  /* TODO limit size of team pool, call reap_team if pool too large */
5399  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5400  __kmp_team_pool = (volatile kmp_team_t *)team;
5401  } else { // Check if team was created for the masters in a teams construct
5402  // See if first worker is a CG root
5403  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5404  team->t.t_threads[1]->th.th_cg_roots);
5405  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5406  // Clean up the CG root nodes on workers so that this team can be re-used
5407  for (f = 1; f < team->t.t_nproc; ++f) {
5408  kmp_info_t *thr = team->t.t_threads[f];
5409  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5410  thr->th.th_cg_roots->cg_root == thr);
5411  // Pop current CG root off list
5412  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5413  thr->th.th_cg_roots = tmp->up;
5414  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5415  " up to node %p. cg_nthreads was %d\n",
5416  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5417  int i = tmp->cg_nthreads--;
5418  if (i == 1) {
5419  __kmp_free(tmp); // free CG if we are the last thread in it
5420  }
5421  // Restore current task's thread_limit from CG root
5422  if (thr->th.th_cg_roots)
5423  thr->th.th_current_task->td_icvs.thread_limit =
5424  thr->th.th_cg_roots->cg_thread_limit;
5425  }
5426  }
5427  }
5428 
5429  KMP_MB();
5430 }
5431 
5432 /* reap the team. destroy it, reclaim all its resources and free its memory */
5433 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5434  kmp_team_t *next_pool = team->t.t_next_pool;
5435 
5436  KMP_DEBUG_ASSERT(team);
5437  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5438  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5439  KMP_DEBUG_ASSERT(team->t.t_threads);
5440  KMP_DEBUG_ASSERT(team->t.t_argv);
5441 
5442  /* TODO clean the threads that are a part of this? */
5443 
5444  /* free stuff */
5445  __kmp_free_team_arrays(team);
5446  if (team->t.t_argv != &team->t.t_inline_argv[0])
5447  __kmp_free((void *)team->t.t_argv);
5448  __kmp_free(team);
5449 
5450  KMP_MB();
5451  return next_pool;
5452 }
5453 
5454 // Free the thread. Don't reap it, just place it on the pool of available
5455 // threads.
5456 //
5457 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5458 // binding for the affinity mechanism to be useful.
5459 //
5460 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5461 // However, we want to avoid a potential performance problem by always
5462 // scanning through the list to find the correct point at which to insert
5463 // the thread (potential N**2 behavior). To do this we keep track of the
5464 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5465 // With single-level parallelism, threads will always be added to the tail
5466 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5467 // parallelism, all bets are off and we may need to scan through the entire
5468 // free list.
5469 //
5470 // This change also has a potentially large performance benefit, for some
5471 // applications. Previously, as threads were freed from the hot team, they
5472 // would be placed back on the free list in inverse order. If the hot team
5473 // grew back to its original size, then the freed thread would be placed
5474 // back on the hot team in reverse order. This could cause bad cache
5475 // locality problems on programs where the size of the hot team regularly
5476 // grew and shrunk.
5477 //
5478 // Now, for single-level parallelism, the OMP tid is always == gtid.
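//
// Illustrative example: if the pool currently holds threads with gtids
// 2 -> 3 -> 5 and the thread with gtid 4 is freed, it is inserted between 3
// and 5. If __kmp_thread_pool_insert_pt already points past the new element
// (here, at gtid 5), it is reset to NULL first, forcing a rescan from the
// head of the list.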
5479 void __kmp_free_thread(kmp_info_t *this_th) {
5480  int gtid;
5481  kmp_info_t **scan;
5482 
5483  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5484  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5485 
5486  KMP_DEBUG_ASSERT(this_th);
5487 
5488  // When moving the thread to the pool, switch it to wait on its own b_go flag
5489  // and leave it with an uninitialized (NULL) team.
5490  int b;
5491  kmp_balign_t *balign = this_th->th.th_bar;
5492  for (b = 0; b < bs_last_barrier; ++b) {
5493  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5494  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5495  balign[b].bb.team = NULL;
5496  balign[b].bb.leaf_kids = 0;
5497  }
5498  this_th->th.th_task_state = 0;
5499  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5500 
5501  /* put thread back on the free pool */
5502  TCW_PTR(this_th->th.th_team, NULL);
5503  TCW_PTR(this_th->th.th_root, NULL);
5504  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5505 
5506  while (this_th->th.th_cg_roots) {
5507  this_th->th.th_cg_roots->cg_nthreads--;
5508  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5509  " %p of thread %p to %d\n",
5510  this_th, this_th->th.th_cg_roots,
5511  this_th->th.th_cg_roots->cg_root,
5512  this_th->th.th_cg_roots->cg_nthreads));
5513  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5514  if (tmp->cg_root == this_th) { // Thread is a cg_root
5515  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5516  KA_TRACE(
5517  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5518  this_th->th.th_cg_roots = tmp->up;
5519  __kmp_free(tmp);
5520  } else { // Worker thread
5521  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5522  __kmp_free(tmp);
5523  }
5524  this_th->th.th_cg_roots = NULL;
5525  break;
5526  }
5527  }
5528 
5529  /* If the implicit task assigned to this thread can be used by other threads,
5530  * multiple threads may share the data and try to free the task in
5531  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5532  * likely when the hot team is disabled, but it can occur even when the hot
5533  * team is enabled. */
5534  __kmp_free_implicit_task(this_th);
5535  this_th->th.th_current_task = NULL;
5536 
5537  // If the __kmp_thread_pool_insert_pt is already past the new insert
5538  // point, then we need to re-scan the entire list.
5539  gtid = this_th->th.th_info.ds.ds_gtid;
5540  if (__kmp_thread_pool_insert_pt != NULL) {
5541  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5542  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5543  __kmp_thread_pool_insert_pt = NULL;
5544  }
5545  }
5546 
5547  // Scan down the list to find the place to insert the thread.
5548  // scan is the address of a link in the list, possibly the address of
5549  // __kmp_thread_pool itself.
5550  //
5551  // In the absence of nested parallelism, the for loop will have 0 iterations.
5552  if (__kmp_thread_pool_insert_pt != NULL) {
5553  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5554  } else {
5555  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5556  }
5557  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5558  scan = &((*scan)->th.th_next_pool))
5559  ;
5560 
5561  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5562  // to its address.
5563  TCW_PTR(this_th->th.th_next_pool, *scan);
5564  __kmp_thread_pool_insert_pt = *scan = this_th;
5565  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5566  (this_th->th.th_info.ds.ds_gtid <
5567  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5568  TCW_4(this_th->th.th_in_pool, TRUE);
5569  __kmp_suspend_initialize_thread(this_th);
5570  __kmp_lock_suspend_mx(this_th);
5571  if (this_th->th.th_active == TRUE) {
5572  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5573  this_th->th.th_active_in_pool = TRUE;
5574  }
5575 #if KMP_DEBUG
5576  else {
5577  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5578  }
5579 #endif
5580  __kmp_unlock_suspend_mx(this_th);
5581 
5582  TCW_4(__kmp_nth, __kmp_nth - 1);
5583 
5584 #ifdef KMP_ADJUST_BLOCKTIME
5585  /* Adjust blocktime back to user setting or default if necessary */
5586  /* Middle initialization might never have occurred */
5587  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5588  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5589  if (__kmp_nth <= __kmp_avail_proc) {
5590  __kmp_zero_bt = FALSE;
5591  }
5592  }
5593 #endif /* KMP_ADJUST_BLOCKTIME */
5594 
5595  KMP_MB();
5596 }
5597 
5598 /* ------------------------------------------------------------------------ */
5599 
5600 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5601  int gtid = this_thr->th.th_info.ds.ds_gtid;
5602  /* void *stack_data;*/
5603  kmp_team_t *(*volatile pteam);
5604 
5605  KMP_MB();
5606  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5607 
5608  if (__kmp_env_consistency_check) {
5609  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5610  }
5611 
5612 #if OMPT_SUPPORT
5613  ompt_data_t *thread_data;
5614  if (ompt_enabled.enabled) {
5615  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5616  *thread_data = ompt_data_none;
5617 
5618  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5619  this_thr->th.ompt_thread_info.wait_id = 0;
5620  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5621  if (ompt_enabled.ompt_callback_thread_begin) {
5622  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5623  ompt_thread_worker, thread_data);
5624  }
5625  }
5626 #endif
5627 
5628 #if OMPT_SUPPORT
5629  if (ompt_enabled.enabled) {
5630  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5631  }
5632 #endif
5633  /* This is the place where threads wait for work */
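  /* Worker loop: sleep in the fork barrier until released with a team to run,
     invoke the team's microtask via t_invoke, synchronize in the join barrier,
     and repeat until library shutdown sets __kmp_global.g.g_done. */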
5634  while (!TCR_4(__kmp_global.g.g_done)) {
5635  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5636  KMP_MB();
5637 
5638  /* wait for work to do */
5639  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5640 
5641  /* No tid yet since not part of a team */
5642  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5643 
5644 #if OMPT_SUPPORT
5645  if (ompt_enabled.enabled) {
5646  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5647  }
5648 #endif
5649 
5650  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5651 
5652  /* have we been allocated? */
5653  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5654  /* we were just woken up, so run our new task */
5655  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5656  int rc;
5657  KA_TRACE(20,
5658  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5659  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5660  (*pteam)->t.t_pkfn));
5661 
5662  updateHWFPControl(*pteam);
5663 
5664 #if OMPT_SUPPORT
5665  if (ompt_enabled.enabled) {
5666  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5667  }
5668 #endif
5669 
5670  rc = (*pteam)->t.t_invoke(gtid);
5671  KMP_ASSERT(rc);
5672 
5673  KMP_MB();
5674  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5675  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5676  (*pteam)->t.t_pkfn));
5677  }
5678 #if OMPT_SUPPORT
5679  if (ompt_enabled.enabled) {
5680  /* no frame set while outside task */
5681  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5682 
5683  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5684  }
5685 #endif
5686  /* join barrier after parallel region */
5687  __kmp_join_barrier(gtid);
5688  }
5689  }
5690  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5691 
5692 #if OMPT_SUPPORT
5693  if (ompt_enabled.ompt_callback_thread_end) {
5694  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5695  }
5696 #endif
5697 
5698  this_thr->th.th_task_team = NULL;
5699  /* run the destructors for the threadprivate data for this thread */
5700  __kmp_common_destroy_gtid(gtid);
5701 
5702  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5703  KMP_MB();
5704  return this_thr;
5705 }
5706 
5707 /* ------------------------------------------------------------------------ */
5708 
5709 void __kmp_internal_end_dest(void *specific_gtid) {
5710 #if KMP_COMPILER_ICC
5711 #pragma warning(push)
5712 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5713 // significant bits
5714 #endif
5715  // Make sure no significant bits are lost
5716  int gtid = (kmp_intptr_t)specific_gtid - 1;
5717 #if KMP_COMPILER_ICC
5718 #pragma warning(pop)
5719 #endif
5720 
5721  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5722  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5723  * this is because 0 is reserved for the nothing-stored case */
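  /* Example: gtid 0 is stored as 1, so a stored value of 0 unambiguously means
     "no gtid recorded"; hence the -1 adjustment above. */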
5724 
5725  /* josh: One reason for setting the gtid specific data even when it is being
5726  destroyed by pthread is to allow gtid lookup through thread specific data
5727  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5728  that gets executed in the call to __kmp_internal_end_thread, actually
5729  gets the gtid through the thread specific data. Setting it here seems
5730  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5731  to run smoothly.
5732  todo: get rid of this after we remove the dependence on
5733  __kmp_gtid_get_specific */
5734  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5735  __kmp_gtid_set_specific(gtid);
5736 #ifdef KMP_TDATA_GTID
5737  __kmp_gtid = gtid;
5738 #endif
5739  __kmp_internal_end_thread(gtid);
5740 }
5741 
5742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5743 
5744 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5745 // cases destructors work perfectly, but in the real libomp.so I have no evidence
5746 // it is ever called. However, the -fini linker option in makefile.mk works fine.
5747 
5748 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5749  __kmp_internal_end_atexit();
5750 }
5751 
5752 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5753 
5754 #endif
5755 
5756 /* [Windows] josh: when the atexit handler is called, there may still be more
5757  than one thread alive */
5758 void __kmp_internal_end_atexit(void) {
5759  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5760  /* [Windows]
5761  josh: ideally, we want to completely shut down the library in this atexit
5762  handler, but stat code that depends on thread specific data for gtid fails
5763  because that data becomes unavailable at some point during the shutdown, so
5764  we call __kmp_internal_end_thread instead. We should eventually remove the
5765  dependency on __kmp_get_specific_gtid in the stat code and use
5766  __kmp_internal_end_library to cleanly shut down the library.
5767 
5768  // TODO: Can some of this comment about GVS be removed?
5769  I suspect that the offending stat code is executed when the calling thread
5770  tries to clean up a dead root thread's data structures, resulting in GVS
5771  code trying to close the GVS structures for that thread, but since the stat
5772  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5773  the calling thread is cleaning up itself instead of another thread, it gets
5774  confused. This happens because allowing a thread to unregister and clean up
5775  another thread is a recent modification for addressing an issue.
5776  Based on the current design (20050722), a thread may end up
5777  trying to unregister another thread only if thread death does not trigger
5778  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5779  thread specific data destructor function to detect thread death. For
5780  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5781  is nothing. Thus, the workaround is applicable only to the Windows static
5782  stat library. */
5783  __kmp_internal_end_library(-1);
5784 #if KMP_OS_WINDOWS
5785  __kmp_close_console();
5786 #endif
5787 }
5788 
5789 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5790  // It is assumed __kmp_forkjoin_lock is acquired.
5791 
5792  int gtid;
5793 
5794  KMP_DEBUG_ASSERT(thread != NULL);
5795 
5796  gtid = thread->th.th_info.ds.ds_gtid;
5797 
5798  if (!is_root) {
5799  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5800  /* Assume the threads are at the fork barrier here */
5801  KA_TRACE(
5802  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5803  gtid));
5804  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5805  * (GEH) */
5806  ANNOTATE_HAPPENS_BEFORE(thread);
5807  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5808  __kmp_release_64(&flag);
5809  }
5810 
5811  // Terminate OS thread.
5812  __kmp_reap_worker(thread);
5813 
5814  // The thread was killed asynchronously. If it was actively
5815  // spinning in the thread pool, decrement the global count.
5816  //
5817  // There is a small timing hole here - if the worker thread was just waking
5818  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5819  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5820  // the global counter might not get updated.
5821  //
5822  // Currently, this can only happen as the library is unloaded,
5823  // so there are no harmful side effects.
5824  if (thread->th.th_active_in_pool) {
5825  thread->th.th_active_in_pool = FALSE;
5826  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5827  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5828  }
5829  }
5830 
5831  __kmp_free_implicit_task(thread);
5832 
5833 // Free the fast memory for tasking
5834 #if USE_FAST_MEMORY
5835  __kmp_free_fast_memory(thread);
5836 #endif /* USE_FAST_MEMORY */
5837 
5838  __kmp_suspend_uninitialize_thread(thread);
5839 
5840  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5841  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5842 
5843  --__kmp_all_nth;
5844 // __kmp_nth was decremented when thread is added to the pool.
5845 
5846 #ifdef KMP_ADJUST_BLOCKTIME
5847  /* Adjust blocktime back to user setting or default if necessary */
5848  /* Middle initialization might never have occurred */
5849  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5850  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5851  if (__kmp_nth <= __kmp_avail_proc) {
5852  __kmp_zero_bt = FALSE;
5853  }
5854  }
5855 #endif /* KMP_ADJUST_BLOCKTIME */
5856 
5857  /* free the memory being used */
5858  if (__kmp_env_consistency_check) {
5859  if (thread->th.th_cons) {
5860  __kmp_free_cons_stack(thread->th.th_cons);
5861  thread->th.th_cons = NULL;
5862  }
5863  }
5864 
5865  if (thread->th.th_pri_common != NULL) {
5866  __kmp_free(thread->th.th_pri_common);
5867  thread->th.th_pri_common = NULL;
5868  }
5869 
5870  if (thread->th.th_task_state_memo_stack != NULL) {
5871  __kmp_free(thread->th.th_task_state_memo_stack);
5872  thread->th.th_task_state_memo_stack = NULL;
5873  }
5874 
5875 #if KMP_USE_BGET
5876  if (thread->th.th_local.bget_data != NULL) {
5877  __kmp_finalize_bget(thread);
5878  }
5879 #endif
5880 
5881 #if KMP_AFFINITY_SUPPORTED
5882  if (thread->th.th_affin_mask != NULL) {
5883  KMP_CPU_FREE(thread->th.th_affin_mask);
5884  thread->th.th_affin_mask = NULL;
5885  }
5886 #endif /* KMP_AFFINITY_SUPPORTED */
5887 
5888 #if KMP_USE_HIER_SCHED
5889  if (thread->th.th_hier_bar_data != NULL) {
5890  __kmp_free(thread->th.th_hier_bar_data);
5891  thread->th.th_hier_bar_data = NULL;
5892  }
5893 #endif
5894 
5895  __kmp_reap_team(thread->th.th_serial_team);
5896  thread->th.th_serial_team = NULL;
5897  __kmp_free(thread);
5898 
5899  KMP_MB();
5900 
5901 } // __kmp_reap_thread
5902 
5903 static void __kmp_internal_end(void) {
5904  int i;
5905 
5906  /* First, unregister the library */
5907  __kmp_unregister_library();
5908 
5909 #if KMP_OS_WINDOWS
5910  /* In Win static library, we can't tell when a root actually dies, so we
5911  reclaim the data structures for any root threads that have died but not
5912  unregistered themselves, in order to shut down cleanly.
5913  In Win dynamic library we also can't tell when a thread dies. */
5914  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5915 // dead roots
5916 #endif
5917 
5918  for (i = 0; i < __kmp_threads_capacity; i++)
5919  if (__kmp_root[i])
5920  if (__kmp_root[i]->r.r_active)
5921  break;
5922  KMP_MB(); /* Flush all pending memory write invalidates. */
5923  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5924 
5925  if (i < __kmp_threads_capacity) {
5926 #if KMP_USE_MONITOR
5927  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5928  KMP_MB(); /* Flush all pending memory write invalidates. */
5929 
5930  // Need to check that monitor was initialized before reaping it. If we are
5931  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5932  // __kmp_monitor will appear to contain valid data, but it is only valid in
5933  // the parent process, not the child.
5934  // New behavior (201008): instead of keying off of the flag
5935  // __kmp_init_parallel, the monitor thread creation is keyed off
5936  // of the new flag __kmp_init_monitor.
5937  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5938  if (TCR_4(__kmp_init_monitor)) {
5939  __kmp_reap_monitor(&__kmp_monitor);
5940  TCW_4(__kmp_init_monitor, 0);
5941  }
5942  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5943  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5944 #endif // KMP_USE_MONITOR
5945  } else {
5946 /* TODO move this to cleanup code */
5947 #ifdef KMP_DEBUG
5948  /* make sure that everything has properly ended */
5949  for (i = 0; i < __kmp_threads_capacity; i++) {
5950  if (__kmp_root[i]) {
5951  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5952  // there can be uber threads alive here
5953  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5954  }
5955  }
5956 #endif
5957 
5958  KMP_MB();
5959 
5960  // Reap the worker threads.
5961  // This is valid for now, but be careful if threads are reaped sooner.
5962  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
5963  // Get the next thread from the pool.
5964  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5965  __kmp_thread_pool = thread->th.th_next_pool;
5966  // Reap it.
5967  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5968  thread->th.th_next_pool = NULL;
5969  thread->th.th_in_pool = FALSE;
5970  __kmp_reap_thread(thread, 0);
5971  }
5972  __kmp_thread_pool_insert_pt = NULL;
5973 
5974  // Reap teams.
5975  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
5976  // Get the next team from the pool.
5977  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5978  __kmp_team_pool = team->t.t_next_pool;
5979  // Reap it.
5980  team->t.t_next_pool = NULL;
5981  __kmp_reap_team(team);
5982  }
5983 
5984  __kmp_reap_task_teams();
5985 
5986 #if KMP_OS_UNIX
5987  // Threads that are not reaped should not access any resources since they
5988  // are going to be deallocated soon, so the shutdown sequence should wait
5989  // until all threads either exit the final spin-waiting loop or begin
5990  // sleeping after the given blocktime.
5991  for (i = 0; i < __kmp_threads_capacity; i++) {
5992  kmp_info_t *thr = __kmp_threads[i];
5993  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5994  KMP_CPU_PAUSE();
5995  }
5996 #endif
5997 
5998  for (i = 0; i < __kmp_threads_capacity; ++i) {
5999  // TBD: Add some checking...
6000  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6001  }
6002 
6003  /* Make sure all threadprivate destructors get run by joining with all
6004  worker threads before resetting this flag */
6005  TCW_SYNC_4(__kmp_init_common, FALSE);
6006 
6007  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6008  KMP_MB();
6009 
6010 #if KMP_USE_MONITOR
6011  // See note above: One of the possible fixes for CQ138434 / CQ140126
6012  //
6013  // FIXME: push both code fragments down and CSE them?
6014  // push them into __kmp_cleanup() ?
6015  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6016  if (TCR_4(__kmp_init_monitor)) {
6017  __kmp_reap_monitor(&__kmp_monitor);
6018  TCW_4(__kmp_init_monitor, 0);
6019  }
6020  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6021  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6022 #endif
6023  } /* else !__kmp_global.t_active */
6024  TCW_4(__kmp_init_gtid, FALSE);
6025  KMP_MB(); /* Flush all pending memory write invalidates. */
6026 
6027  __kmp_cleanup();
6028 #if OMPT_SUPPORT
6029  ompt_fini();
6030 #endif
6031 }
6032 
6033 void __kmp_internal_end_library(int gtid_req) {
6034  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6035  /* this shouldn't be a race condition because __kmp_internal_end() is the
6036  only place to clear __kmp_serial_init */
6037  /* we'll check this later too, after we get the lock */
6038  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6039  // redundant, because the next check will work in any case.
6040  if (__kmp_global.g.g_abort) {
6041  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6042  /* TODO abort? */
6043  return;
6044  }
6045  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6046  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6047  return;
6048  }
6049 
6050  KMP_MB(); /* Flush all pending memory write invalidates. */
6051 
6052  /* find out who we are and what we should do */
6053  {
6054  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6055  KA_TRACE(
6056  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6057  if (gtid == KMP_GTID_SHUTDOWN) {
6058  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6059  "already shutdown\n"));
6060  return;
6061  } else if (gtid == KMP_GTID_MONITOR) {
6062  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6063  "registered, or system shutdown\n"));
6064  return;
6065  } else if (gtid == KMP_GTID_DNE) {
6066  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6067  "shutdown\n"));
6068  /* we don't know who we are, but we may still shut down the library */
6069  } else if (KMP_UBER_GTID(gtid)) {
6070  /* unregister ourselves as an uber thread. gtid is no longer valid */
6071  if (__kmp_root[gtid]->r.r_active) {
6072  __kmp_global.g.g_abort = -1;
6073  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6074  KA_TRACE(10,
6075  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6076  gtid));
6077  return;
6078  } else {
6079  KA_TRACE(
6080  10,
6081  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6082  __kmp_unregister_root_current_thread(gtid);
6083  }
6084  } else {
6085 /* worker threads may call this function through the atexit handler, if they
6086  * call exit() */
6087 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6088  TODO: do a thorough shutdown instead */
6089 #ifdef DUMP_DEBUG_ON_EXIT
6090  if (__kmp_debug_buf)
6091  __kmp_dump_debug_buffer();
6092 #endif
6093  return;
6094  }
6095  }
6096  /* synchronize the termination process */
6097  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6098 
6099  /* have we already finished */
6100  if (__kmp_global.g.g_abort) {
6101  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6102  /* TODO abort? */
6103  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6104  return;
6105  }
6106  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6107  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6108  return;
6109  }
6110 
6111  /* We need this lock to enforce mutex between this reading of
6112  __kmp_threads_capacity and the writing by __kmp_register_root.
6113  Alternatively, we can use a counter of roots that is atomically updated by
6114  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6115  __kmp_internal_end_*. */
6116  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6117 
6118  /* now we can safely conduct the actual termination */
6119  __kmp_internal_end();
6120 
6121  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6122  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6123 
6124  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6125 
6126 #ifdef DUMP_DEBUG_ON_EXIT
6127  if (__kmp_debug_buf)
6128  __kmp_dump_debug_buffer();
6129 #endif
6130 
6131 #if KMP_OS_WINDOWS
6132  __kmp_close_console();
6133 #endif
6134 
6135  __kmp_fini_allocator();
6136 
6137 } // __kmp_internal_end_library
6138 
6139 void __kmp_internal_end_thread(int gtid_req) {
6140  int i;
6141 
6142  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6143  /* this shouldn't be a race condition because __kmp_internal_end() is the
6144  * only place to clear __kmp_serial_init */
6145  /* we'll check this later too, after we get the lock */
6146  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6147  // redundant, because the next check will work in any case.
6148  if (__kmp_global.g.g_abort) {
6149  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6150  /* TODO abort? */
6151  return;
6152  }
6153  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6154  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6155  return;
6156  }
6157 
6158  KMP_MB(); /* Flush all pending memory write invalidates. */
6159 
6160  /* find out who we are and what we should do */
6161  {
6162  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6163  KA_TRACE(10,
6164  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6165  if (gtid == KMP_GTID_SHUTDOWN) {
6166  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6167  "already shutdown\n"));
6168  return;
6169  } else if (gtid == KMP_GTID_MONITOR) {
6170  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6171  "registered, or system shutdown\n"));
6172  return;
6173  } else if (gtid == KMP_GTID_DNE) {
6174  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6175  "shutdown\n"));
6176  return;
6177  /* we don't know who we are */
6178  } else if (KMP_UBER_GTID(gtid)) {
6179  /* unregister ourselves as an uber thread. gtid is no longer valid */
6180  if (__kmp_root[gtid]->r.r_active) {
6181  __kmp_global.g.g_abort = -1;
6182  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6183  KA_TRACE(10,
6184  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6185  gtid));
6186  return;
6187  } else {
6188  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6189  gtid));
6190  __kmp_unregister_root_current_thread(gtid);
6191  }
6192  } else {
6193  /* just a worker thread, let's leave */
6194  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6195 
6196  if (gtid >= 0) {
6197  __kmp_threads[gtid]->th.th_task_team = NULL;
6198  }
6199 
6200  KA_TRACE(10,
6201  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6202  gtid));
6203  return;
6204  }
6205  }
6206 #if KMP_DYNAMIC_LIB
6207  if (__kmp_pause_status != kmp_hard_paused)
6208  // AC: let's not shut down the dynamic library at the exit of an uber
6209  // thread; it is better to shut down later, in the library destructor.
6210  {
6211  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6212  return;
6213  }
6214 #endif
6215  /* synchronize the termination process */
6216  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6217 
6218  /* have we already finished */
6219  if (__kmp_global.g.g_abort) {
6220  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6221  /* TODO abort? */
6222  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6223  return;
6224  }
6225  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6226  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6227  return;
6228  }
6229 
6230  /* We need this lock to enforce mutex between this reading of
6231  __kmp_threads_capacity and the writing by __kmp_register_root.
6232  Alternatively, we can use a counter of roots that is atomically updated by
6233  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6234  __kmp_internal_end_*. */
6235 
6236  /* should we finish the run-time? are all siblings done? */
6237  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6238 
6239  for (i = 0; i < __kmp_threads_capacity; ++i) {
6240  if (KMP_UBER_GTID(i)) {
6241  KA_TRACE(
6242  10,
6243  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6244  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6245  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6246  return;
6247  }
6248  }
6249 
6250  /* now we can safely conduct the actual termination */
6251 
6252  __kmp_internal_end();
6253 
6254  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6255  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6256 
6257  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6258 
6259 #ifdef DUMP_DEBUG_ON_EXIT
6260  if (__kmp_debug_buf)
6261  __kmp_dump_debug_buffer();
6262 #endif
6263 } // __kmp_internal_end_thread
6264 
6265 // -----------------------------------------------------------------------------
6266 // Library registration stuff.
6267 
6268 static long __kmp_registration_flag = 0;
6269 // Random value used to indicate library initialization.
6270 static char *__kmp_registration_str = NULL;
6271 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6272 
6273 static inline char *__kmp_reg_status_name() {
6274  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6275  each thread. If registration and unregistration go in different threads
6276  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6277  env var cannot be found, because the name will contain a different pid. */
6278  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6279 } // __kmp_reg_status_name
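// For example, a process with a (hypothetical) pid of 1234 would use the
// variable name "__KMP_REGISTERED_LIB_1234".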
6280 
6281 void __kmp_register_library_startup(void) {
6282 
6283  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6284  int done = 0;
6285  union {
6286  double dtime;
6287  long ltime;
6288  } time;
6289 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6290  __kmp_initialize_system_tick();
6291 #endif
6292  __kmp_read_system_time(&time.dtime);
6293  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6294  __kmp_registration_str =
6295  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6296  __kmp_registration_flag, KMP_LIBRARY_FILE);
6297 
6298  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6299  __kmp_registration_str));
6300 
6301  while (!done) {
6302 
6303  char *value = NULL; // Actual value of the environment variable.
6304 
6305  // Set the environment variable, but do not overwrite it if it already exists.
6306  __kmp_env_set(name, __kmp_registration_str, 0);
6307  // Check that the variable was actually written.
6308  value = __kmp_env_get(name);
6309  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6310 
6311  done = 1; // Ok, environment variable set successfully, exit the loop.
6312 
6313  } else {
6314 
6315  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6316  // Check whether it is alive or dead.
6317  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6318  char *tail = value;
6319  char *flag_addr_str = NULL;
6320  char *flag_val_str = NULL;
6321  char const *file_name = NULL;
6322  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6323  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6324  file_name = tail;
6325  if (tail != NULL) {
6326  long *flag_addr = 0;
6327  long flag_val = 0;
6328  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6329  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6330  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6331  // First, check whether environment-encoded address is mapped into
6332  // addr space.
6333  // If so, dereference it to see if it still has the right value.
6334  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6335  neighbor = 1;
6336  } else {
6337  // If not, then we know the other copy of the library is no longer
6338  // running.
6339  neighbor = 2;
6340  }
6341  }
6342  }
6343  switch (neighbor) {
6344  case 0: // Cannot parse environment variable -- neighbor status unknown.
6345  // Assume it is an incompatible format from a future version of the
6346  // library, and assume the other library is alive.
6347  // WARN( ... ); // TODO: Issue a warning.
6348  file_name = "unknown library";
6349  KMP_FALLTHROUGH();
6350  // Attention! Falling through to the next case is intentional.
6351  case 1: { // Neighbor is alive.
6352  // Check whether a duplicate library is allowed.
6353  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6354  if (!__kmp_str_match_true(duplicate_ok)) {
6355  // That's not allowed. Issue fatal error.
6356  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6357  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6358  }
6359  KMP_INTERNAL_FREE(duplicate_ok);
6360  __kmp_duplicate_library_ok = 1;
6361  done = 1; // Exit the loop.
6362  } break;
6363  case 2: { // Neighbor is dead.
6364  // Clear the variable and try to register library again.
6365  __kmp_env_unset(name);
6366  } break;
6367  default: { KMP_DEBUG_ASSERT(0); } break;
6368  }
6369  }
6370  KMP_INTERNAL_FREE((void *)value);
6371  }
6372  KMP_INTERNAL_FREE((void *)name);
6373 
6374 } // func __kmp_register_library_startup
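// For illustration, the value stored in __KMP_REGISTERED_LIB_<pid> has the
// shape "0x7f2a5c0031c0-cafe1a2b-libomp.so" (hypothetical address and tick
// bits): the address of this image's __kmp_registration_flag, the flag value
// in hex, and KMP_LIBRARY_FILE. A second copy of the runtime that finds the
// variable already set decides whether the registering copy is still alive by
// checking that the recorded address is still mapped and still holds the
// recorded value, exactly as the parsing code above does.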
6375 
6376 void __kmp_unregister_library(void) {
6377 
6378  char *name = __kmp_reg_status_name();
6379  char *value = __kmp_env_get(name);
6380 
6381  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6382  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6383  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6384  // Ok, this is our variable. Delete it.
6385  __kmp_env_unset(name);
6386  }
6387 
6388  KMP_INTERNAL_FREE(__kmp_registration_str);
6389  KMP_INTERNAL_FREE(value);
6390  KMP_INTERNAL_FREE(name);
6391 
6392  __kmp_registration_flag = 0;
6393  __kmp_registration_str = NULL;
6394 
6395 } // __kmp_unregister_library
6396 
6397 // End of Library registration stuff.
6398 // -----------------------------------------------------------------------------
6399 
6400 #if KMP_MIC_SUPPORTED
6401 
6402 static void __kmp_check_mic_type() {
6403  kmp_cpuid_t cpuid_state = {0};
6404  kmp_cpuid_t *cs_p = &cpuid_state;
6405  __kmp_x86_cpuid(1, 0, cs_p);
6406  // We don't support mic1 at the moment
6407  if ((cs_p->eax & 0xff0) == 0xB10) { // family 0x0B: Knights Corner (KNC)
6408  __kmp_mic_type = mic2;
6409  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { // family 6, model 0x57: KNL
6410  __kmp_mic_type = mic3;
6411  } else {
6412  __kmp_mic_type = non_mic;
6413  }
6414 }
6415 
6416 #endif /* KMP_MIC_SUPPORTED */
6417 
6418 static void __kmp_do_serial_initialize(void) {
6419  int i, gtid;
6420  int size;
6421 
6422  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6423 
6424  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6425  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6426  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6427  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6428  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6429 
6430 #if OMPT_SUPPORT
6431  ompt_pre_init();
6432 #endif
6433 
6434  __kmp_validate_locks();
6435 
6436  /* Initialize internal memory allocator */
6437  __kmp_init_allocator();
6438 
6439  /* Register the library startup via an environment variable and check to see
6440  whether another copy of the library is already registered. */
6441 
6442  __kmp_register_library_startup();
6443 
6444  /* TODO reinitialization of library */
6445  if (TCR_4(__kmp_global.g.g_done)) {
6446  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6447  }
6448 
6449  __kmp_global.g.g_abort = 0;
6450  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6451 
6452 /* initialize the locks */
6453 #if KMP_USE_ADAPTIVE_LOCKS
6454 #if KMP_DEBUG_ADAPTIVE_LOCKS
6455  __kmp_init_speculative_stats();
6456 #endif
6457 #endif
6458 #if KMP_STATS_ENABLED
6459  __kmp_stats_init();
6460 #endif
6461  __kmp_init_lock(&__kmp_global_lock);
6462  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6463  __kmp_init_lock(&__kmp_debug_lock);
6464  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6465  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6466  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6467  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6468  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6469  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6470  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6471  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6472  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6473  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6474  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6475  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6476  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6477  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6478  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6479 #if KMP_USE_MONITOR
6480  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6481 #endif
6482  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6483 
6484  /* conduct initialization and initial setup of configuration */
6485 
6486  __kmp_runtime_initialize();
6487 
6488 #if KMP_MIC_SUPPORTED
6489  __kmp_check_mic_type();
6490 #endif
6491 
6492 // Some global variable initialization moved here from kmp_env_initialize()
6493 #ifdef KMP_DEBUG
6494  kmp_diag = 0;
6495 #endif
6496  __kmp_abort_delay = 0;
6497 
6498  // From __kmp_init_dflt_team_nth()
6499  /* assume the entire machine will be used */
6500  __kmp_dflt_team_nth_ub = __kmp_xproc;
6501  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6502  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6503  }
6504  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6505  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6506  }
6507  __kmp_max_nth = __kmp_sys_max_nth;
6508  __kmp_cg_max_nth = __kmp_sys_max_nth;
6509  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6510  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6511  __kmp_teams_max_nth = __kmp_sys_max_nth;
6512  }
6513 
6514  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6515  // part
6516  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6517 #if KMP_USE_MONITOR
6518  __kmp_monitor_wakeups =
6519  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6520  __kmp_bt_intervals =
6521  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6522 #endif
6523  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6524  __kmp_library = library_throughput;
6525  // From KMP_SCHEDULE initialization
6526  __kmp_static = kmp_sch_static_balanced;
6527 // AC: do not use analytical here, because it is non-monotonic
6528 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6529 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6530 // need to repeat assignment
6531 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6532 // bit control and barrier method control parts
6533 #if KMP_FAST_REDUCTION_BARRIER
6534 #define kmp_reduction_barrier_gather_bb ((int)1)
6535 #define kmp_reduction_barrier_release_bb ((int)1)
6536 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6537 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6538 #endif // KMP_FAST_REDUCTION_BARRIER
6539  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6540  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6541  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6542  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6543  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6544 #if KMP_FAST_REDUCTION_BARRIER
6545  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6546  // lin_64 ): hyper,1
6547  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6548  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6549  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6550  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6551  }
6552 #endif // KMP_FAST_REDUCTION_BARRIER
6553  }
6554 #if KMP_FAST_REDUCTION_BARRIER
6555 #undef kmp_reduction_barrier_release_pat
6556 #undef kmp_reduction_barrier_gather_pat
6557 #undef kmp_reduction_barrier_release_bb
6558 #undef kmp_reduction_barrier_gather_bb
6559 #endif // KMP_FAST_REDUCTION_BARRIER
6560 #if KMP_MIC_SUPPORTED
6561  if (__kmp_mic_type == mic2) { // KNC
6562  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6563  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6564  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6565  1; // forkjoin release
6566  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6567  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6568  }
6569 #if KMP_FAST_REDUCTION_BARRIER
6570  if (__kmp_mic_type == mic2) { // KNC
6571  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6572  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6573  }
6574 #endif // KMP_FAST_REDUCTION_BARRIER
6575 #endif // KMP_MIC_SUPPORTED
6576 
6577 // From KMP_CHECKS initialization
6578 #ifdef KMP_DEBUG
6579  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6580 #else
6581  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6582 #endif
6583 
6584  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6585  __kmp_foreign_tp = TRUE;
6586 
6587  __kmp_global.g.g_dynamic = FALSE;
6588  __kmp_global.g.g_dynamic_mode = dynamic_default;
6589 
6590  __kmp_env_initialize(NULL);
6591 
6592 // Print all messages in message catalog for testing purposes.
6593 #ifdef KMP_DEBUG
6594  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6595  if (__kmp_str_match_true(val)) {
6596  kmp_str_buf_t buffer;
6597  __kmp_str_buf_init(&buffer);
6598  __kmp_i18n_dump_catalog(&buffer);
6599  __kmp_printf("%s", buffer.str);
6600  __kmp_str_buf_free(&buffer);
6601  }
6602  __kmp_env_free(&val);
6603 #endif
6604 
6605  __kmp_threads_capacity =
6606  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6607  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6608  __kmp_tp_capacity = __kmp_default_tp_capacity(
6609  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6610 
6611  // If the library is shut down properly, both pools must be NULL. Just in
6612  // case, set them to NULL -- some memory may leak, but subsequent code will
6613  // work even if pools are not freed.
6614  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6615  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6616  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6617  __kmp_thread_pool = NULL;
6618  __kmp_thread_pool_insert_pt = NULL;
6619  __kmp_team_pool = NULL;
6620 
6621  /* Allocate all of the variable sized records */
6622  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6623  * expandable */
6624  /* Since allocation is cache-aligned, just add extra padding at the end */
6625  size =
6626  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6627  CACHE_LINE;
6628  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6629  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6630  sizeof(kmp_info_t *) * __kmp_threads_capacity);
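  // Layout sketch (hypothetical capacity of 4 on a 64-bit target): the single
  // allocation holds 4 * 8 bytes of kmp_info_t* slots, immediately followed by
  // 4 * 8 bytes of kmp_root_t* slots, plus CACHE_LINE bytes of padding;
  // __kmp_root simply points into the second half of the same block, which is
  // why __kmp_cleanup() frees only __kmp_threads.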
6631 
6632  /* init thread counts */
6633  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6634  0); // Asserts fail if the library is reinitializing and
6635  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6636  __kmp_all_nth = 0;
6637  __kmp_nth = 0;
6638 
6639  /* setup the uber master thread and hierarchy */
6640  gtid = __kmp_register_root(TRUE);
6641  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6642  KMP_ASSERT(KMP_UBER_GTID(gtid));
6643  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6644 
6645  KMP_MB(); /* Flush all pending memory write invalidates. */
6646 
6647  __kmp_common_initialize();
6648 
6649 #if KMP_OS_UNIX
6650  /* invoke the child fork handler */
6651  __kmp_register_atfork();
6652 #endif
6653 
6654 #if !KMP_DYNAMIC_LIB
6655  {
6656  /* Invoke the exit handler when the program finishes, only for static
6657  library. For dynamic library, we already have _fini and DllMain. */
6658  int rc = atexit(__kmp_internal_end_atexit);
6659  if (rc != 0) {
6660  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6661  __kmp_msg_null);
6662  }
6663  }
6664 #endif
6665 
6666 #if KMP_HANDLE_SIGNALS
6667 #if KMP_OS_UNIX
6668  /* NOTE: make sure that this is called before the user installs their own
6669  signal handlers so that the user handlers are called first. this way they
6670  can return false, not call our handler, avoid terminating the library, and
6671  continue execution where they left off. */
6672  __kmp_install_signals(FALSE);
6673 #endif /* KMP_OS_UNIX */
6674 #if KMP_OS_WINDOWS
6675  __kmp_install_signals(TRUE);
6676 #endif /* KMP_OS_WINDOWS */
6677 #endif
6678 
6679  /* we have finished the serial initialization */
6680  __kmp_init_counter++;
6681 
6682  __kmp_init_serial = TRUE;
6683 
6684  if (__kmp_settings) {
6685  __kmp_env_print();
6686  }
6687 
6688  if (__kmp_display_env || __kmp_display_env_verbose) {
6689  __kmp_env_print_2();
6690  }
6691 
6692 #if OMPT_SUPPORT
6693  ompt_post_init();
6694 #endif
6695 
6696  KMP_MB();
6697 
6698  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6699 }
6700 
6701 void __kmp_serial_initialize(void) {
6702  if (__kmp_init_serial) {
6703  return;
6704  }
6705  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6706  if (__kmp_init_serial) {
6707  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708  return;
6709  }
6710  __kmp_do_serial_initialize();
6711  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6712 }
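// The *_initialize() wrappers here and below all follow the same double-checked
// pattern on __kmp_initz_lock; a minimal sketch of the shape (hypothetical flag
// and worker names, for illustration only):
//
//   if (initialized)                       // fast path, no lock taken
//     return;
//   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
//   if (initialized) {                     // re-check under the lock
//     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
//     return;
//   }
//   do_initialize();                       // at most one thread reaches this
//   __kmp_release_bootstrap_lock(&__kmp_initz_lock);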
6713 
6714 static void __kmp_do_middle_initialize(void) {
6715  int i, j;
6716  int prev_dflt_team_nth;
6717 
6718  if (!__kmp_init_serial) {
6719  __kmp_do_serial_initialize();
6720  }
6721 
6722  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6723 
6724  // Save the previous value for the __kmp_dflt_team_nth so that
6725  // we can avoid some reinitialization if it hasn't changed.
6726  prev_dflt_team_nth = __kmp_dflt_team_nth;
6727 
6728 #if KMP_AFFINITY_SUPPORTED
6729  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6730  // number of cores on the machine.
6731  __kmp_affinity_initialize();
6732 
6733  // Run through the __kmp_threads array and set the affinity mask
6734  // for each root thread that is currently registered with the RTL.
6735  for (i = 0; i < __kmp_threads_capacity; i++) {
6736  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6737  __kmp_affinity_set_init_mask(i, TRUE);
6738  }
6739  }
6740 #endif /* KMP_AFFINITY_SUPPORTED */
6741 
6742  KMP_ASSERT(__kmp_xproc > 0);
6743  if (__kmp_avail_proc == 0) {
6744  __kmp_avail_proc = __kmp_xproc;
6745  }
6746 
6747  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6748  // correct them now
6749  j = 0;
6750  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6751  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6752  __kmp_avail_proc;
6753  j++;
6754  }
6755 
6756  if (__kmp_dflt_team_nth == 0) {
6757 #ifdef KMP_DFLT_NTH_CORES
6758  // Default #threads = #cores
6759  __kmp_dflt_team_nth = __kmp_ncores;
6760  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6761  "__kmp_ncores (%d)\n",
6762  __kmp_dflt_team_nth));
6763 #else
6764  // Default #threads = #available OS procs
6765  __kmp_dflt_team_nth = __kmp_avail_proc;
6766  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6767  "__kmp_avail_proc(%d)\n",
6768  __kmp_dflt_team_nth));
6769 #endif /* KMP_DFLT_NTH_CORES */
6770  }
6771 
6772  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6773  __kmp_dflt_team_nth = KMP_MIN_NTH;
6774  }
6775  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6776  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6777  }
6778 
6779  // There's no harm in continuing if the following check fails,
6780  // but it indicates an error in the previous logic.
6781  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6782 
6783  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6784  // Run through the __kmp_threads array and set the num threads icv for each
6785  // root thread that is currently registered with the RTL (which has not
6786  // already explicitly set its nthreads-var with a call to
6787  // omp_set_num_threads()).
6788  for (i = 0; i < __kmp_threads_capacity; i++) {
6789  kmp_info_t *thread = __kmp_threads[i];
6790  if (thread == NULL)
6791  continue;
6792  if (thread->th.th_current_task->td_icvs.nproc != 0)
6793  continue;
6794 
6795  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6796  }
6797  }
6798  KA_TRACE(
6799  20,
6800  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6801  __kmp_dflt_team_nth));
6802 
6803 #ifdef KMP_ADJUST_BLOCKTIME
6804  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6805  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6806  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6807  if (__kmp_nth > __kmp_avail_proc) {
6808  __kmp_zero_bt = TRUE;
6809  }
6810  }
6811 #endif /* KMP_ADJUST_BLOCKTIME */
6812 
6813  /* we have finished middle initialization */
6814  TCW_SYNC_4(__kmp_init_middle, TRUE);
6815 
6816  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6817 }
6818 
6819 void __kmp_middle_initialize(void) {
6820  if (__kmp_init_middle) {
6821  return;
6822  }
6823  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6824  if (__kmp_init_middle) {
6825  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6826  return;
6827  }
6828  __kmp_do_middle_initialize();
6829  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6830 }
6831 
6832 void __kmp_parallel_initialize(void) {
6833  int gtid = __kmp_entry_gtid(); // this might be a new root
6834 
6835  /* synchronize parallel initialization (for sibling) */
6836  if (TCR_4(__kmp_init_parallel))
6837  return;
6838  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6839  if (TCR_4(__kmp_init_parallel)) {
6840  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6841  return;
6842  }
6843 
6844  /* TODO reinitialization after we have already shut down */
6845  if (TCR_4(__kmp_global.g.g_done)) {
6846  KA_TRACE(
6847  10,
6848  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6849  __kmp_infinite_loop();
6850  }
6851 
6852  /* jc: The lock __kmp_initz_lock is already held, so calling
6853  __kmp_serial_initialize would cause a deadlock. So we call
6854  __kmp_do_serial_initialize directly. */
6855  if (!__kmp_init_middle) {
6856  __kmp_do_middle_initialize();
6857  }
6858  __kmp_resume_if_hard_paused();
6859 
6860  /* begin initialization */
6861  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6862  KMP_ASSERT(KMP_UBER_GTID(gtid));
6863 
6864 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6865  // Save the FP control regs.
6866  // Worker threads will set theirs to these values at thread startup.
6867  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6868  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6869  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6870 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6871 
6872 #if KMP_OS_UNIX
6873 #if KMP_HANDLE_SIGNALS
6874  /* must be after __kmp_serial_initialize */
6875  __kmp_install_signals(TRUE);
6876 #endif
6877 #endif
6878 
6879  __kmp_suspend_initialize();
6880 
6881 #if defined(USE_LOAD_BALANCE)
6882  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6883  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6884  }
6885 #else
6886  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6887  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6888  }
6889 #endif
6890 
6891  if (__kmp_version) {
6892  __kmp_print_version_2();
6893  }
6894 
6895  /* we have finished parallel initialization */
6896  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6897 
6898  KMP_MB();
6899  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6900 
6901  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6902 }
6903 
6904 /* ------------------------------------------------------------------------ */
6905 
6906 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6907  kmp_team_t *team) {
6908  kmp_disp_t *dispatch;
6909 
6910  KMP_MB();
6911 
6912  /* none of the threads have encountered any constructs, yet. */
6913  this_thr->th.th_local.this_construct = 0;
6914 #if KMP_CACHE_MANAGE
6915  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6916 #endif /* KMP_CACHE_MANAGE */
6917  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6918  KMP_DEBUG_ASSERT(dispatch);
6919  KMP_DEBUG_ASSERT(team->t.t_dispatch);
6920  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6921  // this_thr->th.th_info.ds.ds_tid ] );
6922 
6923  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6924  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6925  if (__kmp_env_consistency_check)
6926  __kmp_push_parallel(gtid, team->t.t_ident);
6927 
6928  KMP_MB(); /* Flush all pending memory write invalidates. */
6929 }
6930 
6931 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6932  kmp_team_t *team) {
6933  if (__kmp_env_consistency_check)
6934  __kmp_pop_parallel(gtid, team->t.t_ident);
6935 
6936  __kmp_finish_implicit_task(this_thr);
6937 }
6938 
6939 int __kmp_invoke_task_func(int gtid) {
6940  int rc;
6941  int tid = __kmp_tid_from_gtid(gtid);
6942  kmp_info_t *this_thr = __kmp_threads[gtid];
6943  kmp_team_t *team = this_thr->th.th_team;
6944 
6945  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6946 #if USE_ITT_BUILD
6947  if (__itt_stack_caller_create_ptr) {
6948  __kmp_itt_stack_callee_enter(
6949  (__itt_caller)
6950  team->t.t_stack_id); // inform ittnotify about entering user's code
6951  }
6952 #endif /* USE_ITT_BUILD */
6953 #if INCLUDE_SSC_MARKS
6954  SSC_MARK_INVOKING();
6955 #endif
6956 
6957 #if OMPT_SUPPORT
6958  void *dummy;
6959  void **exit_runtime_p;
6960  ompt_data_t *my_task_data;
6961  ompt_data_t *my_parallel_data;
6962  int ompt_team_size;
6963 
6964  if (ompt_enabled.enabled) {
6965  exit_runtime_p = &(
6966  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
6967  } else {
6968  exit_runtime_p = &dummy;
6969  }
6970 
6971  my_task_data =
6972  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6973  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6974  if (ompt_enabled.ompt_callback_implicit_task) {
6975  ompt_team_size = team->t.t_nproc;
6976  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6977  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6978  __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
6979  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6980  }
6981 #endif
6982 
6983 #if KMP_STATS_ENABLED
6984  stats_state_e previous_state = KMP_GET_THREAD_STATE();
6985  if (previous_state == stats_state_e::TEAMS_REGION) {
6986  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
6987  } else {
6988  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
6989  }
6990  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
6991 #endif
6992 
6993  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6994  tid, (int)team->t.t_argc, (void **)team->t.t_argv
6995 #if OMPT_SUPPORT
6996  ,
6997  exit_runtime_p
6998 #endif
6999  );
7000 #if OMPT_SUPPORT
7001  *exit_runtime_p = NULL;
7002 #endif
7003 
7004 #if KMP_STATS_ENABLED
7005  if (previous_state == stats_state_e::TEAMS_REGION) {
7006  KMP_SET_THREAD_STATE(previous_state);
7007  }
7008  KMP_POP_PARTITIONED_TIMER();
7009 #endif
7010 
7011 #if USE_ITT_BUILD
7012  if (__itt_stack_caller_create_ptr) {
7013  __kmp_itt_stack_callee_leave(
7014  (__itt_caller)
7015  team->t.t_stack_id); // inform ittnotify about leaving user's code
7016  }
7017 #endif /* USE_ITT_BUILD */
7018  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7019 
7020  return rc;
7021 }
7022 
7023 void __kmp_teams_master(int gtid) {
7024  // This routine is called by all master threads in a teams construct
7025  kmp_info_t *thr = __kmp_threads[gtid];
7026  kmp_team_t *team = thr->th.th_team;
7027  ident_t *loc = team->t.t_ident;
7028  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7029  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7030  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7031  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7032  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7033 
7034  // This thread is a new CG root. Set up the proper variables.
7035  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7036  tmp->cg_root = thr; // Make thr the CG root
7037  // Init to thread limit that was stored when league masters were forked
7038  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7039  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7040  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7041  " cg_nthreads to 1\n",
7042  thr, tmp));
7043  tmp->up = thr->th.th_cg_roots;
7044  thr->th.th_cg_roots = tmp;
7045 
7046 // Launch the league of teams now, but do not let workers execute
7047 // (they hang on fork barrier until next parallel)
7048 #if INCLUDE_SSC_MARKS
7049  SSC_MARK_FORKING();
7050 #endif
7051  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7052  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7053  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7054 #if INCLUDE_SSC_MARKS
7055  SSC_MARK_JOINING();
7056 #endif
7057  // If the team size was reduced from the limit, set it to the new size
7058  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7059  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7060  // AC: last parameter "1" eliminates join barrier which won't work because
7061  // worker threads are in a fork barrier waiting for more parallel regions
7062  __kmp_join_call(loc, gtid
7063 #if OMPT_SUPPORT
7064  ,
7065  fork_context_intel
7066 #endif
7067  ,
7068  1);
7069 }
7070 
7071 int __kmp_invoke_teams_master(int gtid) {
7072  kmp_info_t *this_thr = __kmp_threads[gtid];
7073  kmp_team_t *team = this_thr->th.th_team;
7074 #if KMP_DEBUG
7075  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7076  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7077  (void *)__kmp_teams_master);
7078 #endif
7079  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7080  __kmp_teams_master(gtid);
7081  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7082  return 1;
7083 }
7084 
7085 /* This sets the requested number of threads for the next parallel region
7086  encountered by this team. Since this should be enclosed in the forkjoin
7087  critical section, it should avoid race conditions with asymmetrical nested
7088  parallelism. */
7089 
7090 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7091  kmp_info_t *thr = __kmp_threads[gtid];
7092 
7093  if (num_threads > 0)
7094  thr->th.th_set_nproc = num_threads;
7095 }
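// For illustration, user code such as
//
//   #pragma omp parallel num_threads(4)
//   { /* ... */ }
//
// reaches this routine (via the __kmpc_push_num_threads entry point) with
// num_threads == 4 just before the fork; the pushed th_set_nproc value is then
// consumed as the requested team size when the next parallel region is created.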
7096 
7097 /* this sets the requested number of teams for the teams region and/or
7098  the number of threads for the next parallel region encountered */
7099 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7100  int num_threads) {
7101  kmp_info_t *thr = __kmp_threads[gtid];
7102  KMP_DEBUG_ASSERT(num_teams >= 0);
7103  KMP_DEBUG_ASSERT(num_threads >= 0);
7104 
7105  if (num_teams == 0)
7106  num_teams = 1; // default number of teams is 1.
7107  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7108  if (!__kmp_reserve_warn) {
7109  __kmp_reserve_warn = 1;
7110  __kmp_msg(kmp_ms_warning,
7111  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7112  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7113  }
7114  num_teams = __kmp_teams_max_nth;
7115  }
7116  // Set number of teams (number of threads in the outer "parallel" of the
7117  // teams)
7118  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7119 
7120  // Remember the number of threads for inner parallel regions
7121  if (num_threads == 0) {
7122  if (!TCR_4(__kmp_init_middle))
7123  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7124  num_threads = __kmp_avail_proc / num_teams;
7125  if (num_teams * num_threads > __kmp_teams_max_nth) {
7126  // adjust num_threads w/o warning as it is not a user setting
7127  num_threads = __kmp_teams_max_nth / num_teams;
7128  }
7129  } else {
7130  // This thread will be the master of the league masters
7131  // Store new thread limit; old limit is saved in th_cg_roots list
7132  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7133 
7134  if (num_teams * num_threads > __kmp_teams_max_nth) {
7135  int new_threads = __kmp_teams_max_nth / num_teams;
7136  if (!__kmp_reserve_warn) { // user asked for too many threads
7137  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7138  __kmp_msg(kmp_ms_warning,
7139  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7140  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7141  }
7142  num_threads = new_threads;
7143  }
7144  }
7145  thr->th.th_teams_size.nth = num_threads;
7146 }
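// Worked example (hypothetical numbers): with __kmp_avail_proc == 64,
// __kmp_teams_max_nth == 64 and num_teams == 8, an absent num_threads clause
// (num_threads == 0) yields num_threads = 64 / 8 = 8, and 8 * 8 does not exceed
// __kmp_teams_max_nth, so no further adjustment is made. Had the user requested
// num_threads = 16, then 8 * 16 = 128 > 64 and num_threads would be reduced to
// __kmp_teams_max_nth / num_teams = 8, with a one-time CantFormThrTeam warning.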
7147 
7148 // Set the proc_bind var to use in the following parallel region.
7149 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7150  kmp_info_t *thr = __kmp_threads[gtid];
7151  thr->th.th_set_proc_bind = proc_bind;
7152 }
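// Analogous to __kmp_push_num_threads above: for user code such as
//   #pragma omp parallel proc_bind(spread)
// the compiler pushes proc_bind_spread here (via the __kmpc_push_proc_bind
// entry point), and the value is consumed when the next parallel region forks.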
7153 
7154 /* Launch the worker threads into the microtask. */
7155 
7156 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7157  kmp_info_t *this_thr = __kmp_threads[gtid];
7158 
7159 #ifdef KMP_DEBUG
7160  int f;
7161 #endif /* KMP_DEBUG */
7162 
7163  KMP_DEBUG_ASSERT(team);
7164  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7165  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7166  KMP_MB(); /* Flush all pending memory write invalidates. */
7167 
7168  team->t.t_construct = 0; /* no single directives seen yet */
7169  team->t.t_ordered.dt.t_value =
7170  0; /* thread 0 enters the ordered section first */
7171 
7172  /* Reset the identifiers on the dispatch buffer */
7173  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7174  if (team->t.t_max_nproc > 1) {
7175  int i;
7176  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7177  team->t.t_disp_buffer[i].buffer_index = i;
7178  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7179  }
7180  } else {
7181  team->t.t_disp_buffer[0].buffer_index = 0;
7182  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7183  }
7184 
7185  KMP_MB(); /* Flush all pending memory write invalidates. */
7186  KMP_ASSERT(this_thr->th.th_team == team);
7187 
7188 #ifdef KMP_DEBUG
7189  for (f = 0; f < team->t.t_nproc; f++) {
7190  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7191  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7192  }
7193 #endif /* KMP_DEBUG */
7194 
7195  /* release the worker threads so they may begin working */
7196  __kmp_fork_barrier(gtid, 0);
7197 }
7198 
7199 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7200  kmp_info_t *this_thr = __kmp_threads[gtid];
7201 
7202  KMP_DEBUG_ASSERT(team);
7203  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7204  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7205  KMP_MB(); /* Flush all pending memory write invalidates. */
7206 
7207 /* Join barrier after fork */
7208 
7209 #ifdef KMP_DEBUG
7210  if (__kmp_threads[gtid] &&
7211  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7212  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7213  __kmp_threads[gtid]);
7214  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7215  "team->t.t_nproc=%d\n",
7216  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7217  team->t.t_nproc);
7218  __kmp_print_structure();
7219  }
7220  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7221  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7222 #endif /* KMP_DEBUG */
7223 
7224  __kmp_join_barrier(gtid); /* wait for everyone */
7225 #if OMPT_SUPPORT
7226  if (ompt_enabled.enabled &&
7227  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7228  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7229  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7230  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7231 #if OMPT_OPTIONAL
7232  void *codeptr = NULL;
7233  if (KMP_MASTER_TID(ds_tid) &&
7234  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7235  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7236  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7237 
7238  if (ompt_enabled.ompt_callback_sync_region_wait) {
7239  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7240  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7241  codeptr);
7242  }
7243  if (ompt_enabled.ompt_callback_sync_region) {
7244  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7245  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7246  codeptr);
7247  }
7248 #endif
7249  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7250  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7251  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7252  }
7253  }
7254 #endif
7255 
7256  KMP_MB(); /* Flush all pending memory write invalidates. */
7257  KMP_ASSERT(this_thr->th.th_team == team);
7258 }
7259 
7260 /* ------------------------------------------------------------------------ */
7261 
7262 #ifdef USE_LOAD_BALANCE
7263 
7264 // Return the worker threads actively spinning in the hot team, if we
7265 // are at the outermost level of parallelism. Otherwise, return 0.
7266 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7267  int i;
7268  int retval;
7269  kmp_team_t *hot_team;
7270 
7271  if (root->r.r_active) {
7272  return 0;
7273  }
7274  hot_team = root->r.r_hot_team;
7275  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7276  return hot_team->t.t_nproc - 1; // Don't count master thread
7277  }
7278 
7279  // Skip the master thread - it is accounted for elsewhere.
7280  retval = 0;
7281  for (i = 1; i < hot_team->t.t_nproc; i++) {
7282  if (hot_team->t.t_threads[i]->th.th_active) {
7283  retval++;
7284  }
7285  }
7286  return retval;
7287 }
7288 
7289 // Perform an automatic adjustment to the number of
7290 // threads used by the next parallel region.
7291 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7292  int retval;
7293  int pool_active;
7294  int hot_team_active;
7295  int team_curr_active;
7296  int system_active;
7297 
7298  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7299  set_nproc));
7300  KMP_DEBUG_ASSERT(root);
7301  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7302  ->th.th_current_task->td_icvs.dynamic == TRUE);
7303  KMP_DEBUG_ASSERT(set_nproc > 1);
7304 
7305  if (set_nproc == 1) {
7306  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7307  return 1;
7308  }
7309 
7310  // Threads that are active in the thread pool, active in the hot team for this
7311  // particular root (if we are at the outer par level), and the currently
7312  // executing thread (to become the master) are available to add to the new
7313  // team, but are currently contributing to the system load, and must be
7314  // accounted for.
7315  pool_active = __kmp_thread_pool_active_nth;
7316  hot_team_active = __kmp_active_hot_team_nproc(root);
7317  team_curr_active = pool_active + hot_team_active + 1;
7318 
7319  // Check the system load.
7320  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7321  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7322  "hot team active = %d\n",
7323  system_active, pool_active, hot_team_active));
7324 
7325  if (system_active < 0) {
7326  // There was an error reading the necessary info from /proc, so use the
7327  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7328  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7329  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7330  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7331 
7332  // Make this call behave like the thread limit algorithm.
7333  retval = __kmp_avail_proc - __kmp_nth +
7334  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7335  if (retval > set_nproc) {
7336  retval = set_nproc;
7337  }
7338  if (retval < KMP_MIN_NTH) {
7339  retval = KMP_MIN_NTH;
7340  }
7341 
7342  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7343  retval));
7344  return retval;
7345  }
7346 
7347  // There is a slight delay in the load balance algorithm in detecting new
7348  // running procs. The real system load at this instant should be at least as
7349  // large as the number of active OMP threads available to add to the team.
7350  if (system_active < team_curr_active) {
7351  system_active = team_curr_active;
7352  }
7353  retval = __kmp_avail_proc - system_active + team_curr_active;
7354  if (retval > set_nproc) {
7355  retval = set_nproc;
7356  }
7357  if (retval < KMP_MIN_NTH) {
7358  retval = KMP_MIN_NTH;
7359  }
7360 
7361  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7362  return retval;
7363 } // __kmp_load_balance_nproc()
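// Worked example (hypothetical numbers): with __kmp_avail_proc == 16 and
// set_nproc == 16, suppose the thread pool contributes 3 active threads, the
// hot team 4 active workers, and the encountering master 1 more, so
// team_curr_active == 3 + 4 + 1 == 8. If __kmp_get_load_balance() reports 10
// running threads system-wide, then retval == 16 - 10 + 8 == 14, which lies in
// [KMP_MIN_NTH, set_nproc], so the next team is sized at 14 threads.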
7364 
7365 #endif /* USE_LOAD_BALANCE */
7366 
7367 /* ------------------------------------------------------------------------ */
7368 
7369 /* NOTE: this is called with the __kmp_initz_lock held */
7370 void __kmp_cleanup(void) {
7371  int f;
7372 
7373  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7374 
7375  if (TCR_4(__kmp_init_parallel)) {
7376 #if KMP_HANDLE_SIGNALS
7377  __kmp_remove_signals();
7378 #endif
7379  TCW_4(__kmp_init_parallel, FALSE);
7380  }
7381 
7382  if (TCR_4(__kmp_init_middle)) {
7383 #if KMP_AFFINITY_SUPPORTED
7384  __kmp_affinity_uninitialize();
7385 #endif /* KMP_AFFINITY_SUPPORTED */
7386  __kmp_cleanup_hierarchy();
7387  TCW_4(__kmp_init_middle, FALSE);
7388  }
7389 
7390  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7391 
7392  if (__kmp_init_serial) {
7393  __kmp_runtime_destroy();
7394  __kmp_init_serial = FALSE;
7395  }
7396 
7397  __kmp_cleanup_threadprivate_caches();
7398 
7399  for (f = 0; f < __kmp_threads_capacity; f++) {
7400  if (__kmp_root[f] != NULL) {
7401  __kmp_free(__kmp_root[f]);
7402  __kmp_root[f] = NULL;
7403  }
7404  }
7405  __kmp_free(__kmp_threads);
7406  // __kmp_threads and __kmp_root were allocated at once, as a single block,
7407  // so there is no need to free __kmp_root separately.
7408  __kmp_threads = NULL;
7409  __kmp_root = NULL;
7410  __kmp_threads_capacity = 0;
7411 
7412 #if KMP_USE_DYNAMIC_LOCK
7413  __kmp_cleanup_indirect_user_locks();
7414 #else
7415  __kmp_cleanup_user_locks();
7416 #endif
7417 
7418 #if KMP_AFFINITY_SUPPORTED
7419  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7420  __kmp_cpuinfo_file = NULL;
7421 #endif /* KMP_AFFINITY_SUPPORTED */
7422 
7423 #if KMP_USE_ADAPTIVE_LOCKS
7424 #if KMP_DEBUG_ADAPTIVE_LOCKS
7425  __kmp_print_speculative_stats();
7426 #endif
7427 #endif
7428  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7429  __kmp_nested_nth.nth = NULL;
7430  __kmp_nested_nth.size = 0;
7431  __kmp_nested_nth.used = 0;
7432  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7433  __kmp_nested_proc_bind.bind_types = NULL;
7434  __kmp_nested_proc_bind.size = 0;
7435  __kmp_nested_proc_bind.used = 0;
7436  if (__kmp_affinity_format) {
7437  KMP_INTERNAL_FREE(__kmp_affinity_format);
7438  __kmp_affinity_format = NULL;
7439  }
7440 
7441  __kmp_i18n_catclose();
7442 
7443 #if KMP_USE_HIER_SCHED
7444  __kmp_hier_scheds.deallocate();
7445 #endif
7446 
7447 #if KMP_STATS_ENABLED
7448  __kmp_stats_fini();
7449 #endif
7450 
7451  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7452 }
7453 
7454 /* ------------------------------------------------------------------------ */
7455 
7456 int __kmp_ignore_mppbeg(void) {
7457  char *env;
7458 
7459  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7460  if (__kmp_str_match_false(env))
7461  return FALSE;
7462  }
7463  // By default __kmpc_begin() is a no-op.
7464  return TRUE;
7465 }
7466 
7467 int __kmp_ignore_mppend(void) {
7468  char *env;
7469 
7470  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7471  if (__kmp_str_match_false(env))
7472  return FALSE;
7473  }
7474  // By default __kmpc_end() is a no-op.
7475  return TRUE;
7476 }
7477 
7478 void __kmp_internal_begin(void) {
7479  int gtid;
7480  kmp_root_t *root;
7481 
7482  /* this is a very important step as it will register new sibling threads
7483  and assign these new uber threads a new gtid */
7484  gtid = __kmp_entry_gtid();
7485  root = __kmp_threads[gtid]->th.th_root;
7486  KMP_ASSERT(KMP_UBER_GTID(gtid));
7487 
7488  if (root->r.r_begin)
7489  return;
7490  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7491  if (root->r.r_begin) {
7492  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7493  return;
7494  }
7495 
7496  root->r.r_begin = TRUE;
7497 
7498  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7499 }
7500 
7501 /* ------------------------------------------------------------------------ */
7502 
7503 void __kmp_user_set_library(enum library_type arg) {
7504  int gtid;
7505  kmp_root_t *root;
7506  kmp_info_t *thread;
7507 
7508  /* first, make sure we are initialized so we can get our gtid */
7509 
7510  gtid = __kmp_entry_gtid();
7511  thread = __kmp_threads[gtid];
7512 
7513  root = thread->th.th_root;
7514 
7515  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7516  library_serial));
7517  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7518  thread */
7519  KMP_WARNING(SetLibraryIncorrectCall);
7520  return;
7521  }
7522 
7523  switch (arg) {
7524  case library_serial:
7525  thread->th.th_set_nproc = 0;
7526  set__nproc(thread, 1);
7527  break;
7528  case library_turnaround:
7529  thread->th.th_set_nproc = 0;
7530  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7531  : __kmp_dflt_team_nth_ub);
7532  break;
7533  case library_throughput:
7534  thread->th.th_set_nproc = 0;
7535  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7536  : __kmp_dflt_team_nth_ub);
7537  break;
7538  default:
7539  KMP_FATAL(UnknownLibraryType, arg);
7540  }
7541 
7542  __kmp_aux_set_library(arg);
7543 }
7544 
7545 void __kmp_aux_set_stacksize(size_t arg) {
7546  if (!__kmp_init_serial)
7547  __kmp_serial_initialize();
7548 
7549 #if KMP_OS_DARWIN
7550  if (arg & (0x1000 - 1)) {
7551  arg &= ~(0x1000 - 1);
7552  if (arg + 0x1000) /* check for overflow if we round up */
7553  arg += 0x1000;
7554  }
7555 #endif
7556  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7557 
7558  /* only change the default stacksize before the first parallel region */
7559  if (!TCR_4(__kmp_init_parallel)) {
7560  size_t value = arg; /* argument is in bytes */
7561 
7562  if (value < __kmp_sys_min_stksize)
7563  value = __kmp_sys_min_stksize;
7564  else if (value > KMP_MAX_STKSIZE)
7565  value = KMP_MAX_STKSIZE;
7566 
7567  __kmp_stksize = value;
7568 
7569  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7570  }
7571 
7572  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7573 }
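// Worked example for the KMP_OS_DARWIN rounding above (hypothetical argument):
// a request of 0x10800 bytes is not page aligned, so it is first truncated to
// 0x10000 and then bumped to 0x11000, i.e. rounded up to the next 4 KiB
// boundary, before being clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE].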
7574 
7575 /* set the behaviour of the runtime library */
7576 /* TODO this can cause some odd behaviour with sibling parallelism... */
7577 void __kmp_aux_set_library(enum library_type arg) {
7578  __kmp_library = arg;
7579 
7580  switch (__kmp_library) {
7581  case library_serial: {
7582  KMP_INFORM(LibraryIsSerial);
7583  } break;
7584  case library_turnaround:
7585  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7586  __kmp_use_yield = 2; // only yield when oversubscribed
7587  break;
7588  case library_throughput:
7589  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7590  __kmp_dflt_blocktime = 200;
7591  break;
7592  default:
7593  KMP_FATAL(UnknownLibraryType, arg);
7594  }
7595 }
7596 
7597 /* Getting team information common for all team API */
7598 // Returns NULL if not in teams construct
7599 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7600  kmp_info_t *thr = __kmp_entry_thread();
7601  teams_serialized = 0;
7602  if (thr->th.th_teams_microtask) {
7603  kmp_team_t *team = thr->th.th_team;
7604  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7605  int ii = team->t.t_level;
7606  teams_serialized = team->t.t_serialized;
7607  int level = tlevel + 1;
7608  KMP_DEBUG_ASSERT(ii >= tlevel);
7609  while (ii > level) {
7610  for (teams_serialized = team->t.t_serialized;
7611  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7612  }
7613  if (team->t.t_serialized && (!teams_serialized)) {
7614  team = team->t.t_parent;
7615  continue;
7616  }
7617  if (ii > level) {
7618  team = team->t.t_parent;
7619  ii--;
7620  }
7621  }
7622  return team;
7623  }
7624  return NULL;
7625 }
7626 
7627 int __kmp_aux_get_team_num() {
7628  int serialized;
7629  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7630  if (team) {
7631  if (serialized > 1) {
7632  return 0; // teams region is serialized ( 1 team of 1 thread ).
7633  } else {
7634  return team->t.t_master_tid;
7635  }
7636  }
7637  return 0;
7638 }
7639 
7640 int __kmp_aux_get_num_teams() {
7641  int serialized;
7642  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7643  if (team) {
7644  if (serialized > 1) {
7645  return 1;
7646  } else {
7647  return team->t.t_parent->t.t_nproc;
7648  }
7649  }
7650  return 1;
7651 }
7652 
7653 /* ------------------------------------------------------------------------ */
7654 
7655 /*
7656  * Affinity Format Parser
7657  *
7658  * Each field is of the form: %[[[0].]size]type
7659  * % and type are required (%% means print a literal '%')
7660  * type is either single char or long name surrounded by {},
7661  * e.g., N or {num_threads}
7662  * 0 => leading zeros
7663  * . => right justified when size is specified
7664  * by default output is left justified
7665  * size is the *minimum* field length
7666  * All other characters are printed as is
7667  *
7668  * Available field types:
7669  * L {nesting_level} - omp_get_level()
7670  * n {thread_num} - omp_get_thread_num()
7671  * H {host} - name of host machine
7672  * P {process_id} - process id (integer)
7673  * i {native_thread_id} - native thread identifier (integer)
7674  * N {num_threads} - omp_get_num_threads()
7675  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7676  * A {thread_affinity} - comma separated list of integers or integer ranges
7677  * (values of affinity mask)
7678  *
7679  * Implementation-specific field types can be added
7680  * If a type is unknown, print "undefined"
7681 */
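// For illustration (hypothetical values), with the field table below a format
// string such as "host %H, pid %P, thread %0.3n of %N" could expand to
// "host node01, pid 4321, thread 002 of 8": "%0.3n" requests leading zeros and
// right justification in a minimum width of 3, while "%N" uses the default
// left-justified formatting with no minimum width.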
7682 
7683 // Structure holding the short name, long name, and corresponding data type
7684 // for snprintf. A table of these will represent the entire valid keyword
7685 // field types.
7686 typedef struct kmp_affinity_format_field_t {
7687  char short_name; // from spec e.g., L -> thread level
7688  const char *long_name; // from spec thread_level -> thread level
7689  char field_format; // data type for snprintf (typically 'd' or 's'
7690  // for integer or string)
7691 } kmp_affinity_format_field_t;
7692 
7693 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7694 #if KMP_AFFINITY_SUPPORTED
7695  {'A', "thread_affinity", 's'},
7696 #endif
7697  {'t', "team_num", 'd'},
7698  {'T', "num_teams", 'd'},
7699  {'L', "nesting_level", 'd'},
7700  {'n', "thread_num", 'd'},
7701  {'N', "num_threads", 'd'},
7702  {'a', "ancestor_tnum", 'd'},
7703  {'H', "host", 's'},
7704  {'P', "process_id", 'd'},
7705  {'i', "native_thread_id", 'd'}};
7706 
7707 // Returns the number of characters needed to hold the field
7708 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7709  const char **ptr,
7710  kmp_str_buf_t *field_buffer) {
7711  int rc, format_index, field_value;
7712  const char *width_left, *width_right;
7713  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7714  static const int FORMAT_SIZE = 20;
7715  char format[FORMAT_SIZE] = {0};
7716  char absolute_short_name = 0;
7717 
7718  KMP_DEBUG_ASSERT(gtid >= 0);
7719  KMP_DEBUG_ASSERT(th);
7720  KMP_DEBUG_ASSERT(**ptr == '%');
7721  KMP_DEBUG_ASSERT(field_buffer);
7722 
7723  __kmp_str_buf_clear(field_buffer);
7724 
7725  // Skip the initial %
7726  (*ptr)++;
7727 
7728  // Check for %% first
7729  if (**ptr == '%') {
7730  __kmp_str_buf_cat(field_buffer, "%", 1);
7731  (*ptr)++; // skip over the second %
7732  return 1;
7733  }
7734 
7735  // Parse field modifiers if they are present
7736  pad_zeros = false;
7737  if (**ptr == '0') {
7738  pad_zeros = true;
7739  (*ptr)++; // skip over 0
7740  }
7741  right_justify = false;
7742  if (**ptr == '.') {
7743  right_justify = true;
7744  (*ptr)++; // skip over .
7745  }
7746  // Parse width of field: [width_left, width_right)
7747  width_left = width_right = NULL;
7748  if (**ptr >= '0' && **ptr <= '9') {
7749  width_left = *ptr;
7750  SKIP_DIGITS(*ptr);
7751  width_right = *ptr;
7752  }
7753 
7754  // Create the format for KMP_SNPRINTF based on flags parsed above
7755  format_index = 0;
7756  format[format_index++] = '%';
7757  if (!right_justify)
7758  format[format_index++] = '-';
7759  if (pad_zeros)
7760  format[format_index++] = '0';
7761  if (width_left && width_right) {
7762  int i = 0;
7763  // Only allow widths of at most 8 digits.
7764  // This also prevents overflowing the format buffer.
7765  while (i < 8 && width_left < width_right) {
7766  format[format_index++] = *width_left;
7767  width_left++;
7768  i++;
7769  }
7770  }
7771 
7772  // Parse a name (long or short)
7773  // Canonicalize the name into absolute_short_name
7774  found_valid_name = false;
7775  parse_long_name = (**ptr == '{');
7776  if (parse_long_name)
7777  (*ptr)++; // skip initial left brace
7778  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7779  sizeof(__kmp_affinity_format_table[0]);
7780  ++i) {
7781  char short_name = __kmp_affinity_format_table[i].short_name;
7782  const char *long_name = __kmp_affinity_format_table[i].long_name;
7783  char field_format = __kmp_affinity_format_table[i].field_format;
7784  if (parse_long_name) {
7785  int length = KMP_STRLEN(long_name);
7786  if (strncmp(*ptr, long_name, length) == 0) {
7787  found_valid_name = true;
7788  (*ptr) += length; // skip the long name
7789  }
7790  } else if (**ptr == short_name) {
7791  found_valid_name = true;
7792  (*ptr)++; // skip the short name
7793  }
7794  if (found_valid_name) {
7795  format[format_index++] = field_format;
7796  format[format_index++] = '\0';
7797  absolute_short_name = short_name;
7798  break;
7799  }
7800  }
7801  if (parse_long_name) {
7802  if (**ptr != '}') {
7803  absolute_short_name = 0;
7804  } else {
7805  (*ptr)++; // skip over the right brace
7806  }
7807  }
7808 
7809  // Attempt to fill the buffer with the requested
7810  // value using snprintf within __kmp_str_buf_print()
7811  switch (absolute_short_name) {
7812  case 't':
7813  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7814  break;
7815  case 'T':
7816  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7817  break;
7818  case 'L':
7819  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7820  break;
7821  case 'n':
7822  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7823  break;
7824  case 'H': {
7825  static const int BUFFER_SIZE = 256;
7826  char buf[BUFFER_SIZE];
7827  __kmp_expand_host_name(buf, BUFFER_SIZE);
7828  rc = __kmp_str_buf_print(field_buffer, format, buf);
7829  } break;
7830  case 'P':
7831  rc = __kmp_str_buf_print(field_buffer, format, getpid());
7832  break;
7833  case 'i':
7834  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7835  break;
7836  case 'N':
7837  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7838  break;
7839  case 'a':
7840  field_value =
7841  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7842  rc = __kmp_str_buf_print(field_buffer, format, field_value);
7843  break;
7844 #if KMP_AFFINITY_SUPPORTED
7845  case 'A': {
7846  kmp_str_buf_t buf;
7847  __kmp_str_buf_init(&buf);
7848  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7849  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7850  __kmp_str_buf_free(&buf);
7851  } break;
7852 #endif
7853  default:
7854  // According to the spec, if an implementation does not have info for a
7855  // field type, then "undefined" is printed
7856  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7857  // Skip the field
7858  if (parse_long_name) {
7859  SKIP_TOKEN(*ptr);
7860  if (**ptr == '}')
7861  (*ptr)++;
7862  } else {
7863  (*ptr)++;
7864  }
7865  }
7866 
7867  KMP_ASSERT(format_index <= FORMAT_SIZE);
7868  return rc;
7869 }
7870 
7871 /*
7872  * Returns the number of characters needed to hold the affinity string
7873  * (not including the terminating null byte).
7874  * The resulting string is written to buffer, which the caller can then
7875  * handle afterwards.
7876 */
7877 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7878  kmp_str_buf_t *buffer) {
7879  const char *parse_ptr;
7880  size_t retval;
7881  const kmp_info_t *th;
7882  kmp_str_buf_t field;
7883 
7884  KMP_DEBUG_ASSERT(buffer);
7885  KMP_DEBUG_ASSERT(gtid >= 0);
7886 
7887  __kmp_str_buf_init(&field);
7888  __kmp_str_buf_clear(buffer);
7889 
7890  th = __kmp_threads[gtid];
7891  retval = 0;
7892 
7893  // If format is NULL or a zero-length string, then we use the
7894  // affinity-format-var ICV
7895  parse_ptr = format;
7896  if (parse_ptr == NULL || *parse_ptr == '\0') {
7897  parse_ptr = __kmp_affinity_format;
7898  }
7899  KMP_DEBUG_ASSERT(parse_ptr);
7900 
7901  while (*parse_ptr != '\0') {
7902  // Parse a field
7903  if (*parse_ptr == '%') {
7904  // Put field in the buffer
7905  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7906  __kmp_str_buf_catbuf(buffer, &field);
7907  retval += rc;
7908  } else {
7909  // Put literal character in buffer
7910  __kmp_str_buf_cat(buffer, parse_ptr, 1);
7911  retval++;
7912  parse_ptr++;
7913  }
7914  }
7915  __kmp_str_buf_free(&field);
7916  return retval;
7917 }
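
// __kmp_aux_capture_affinity() is the engine behind omp_capture_affinity(),
// whose return value (characters needed, excluding the terminating null)
// lets callers detect truncation or size a buffer. An illustrative user-side
// sketch with a made-up function name, guarded with #if 0 because it is not
// part of the runtime:
#if 0
#include <omp.h>
#include <stdio.h>

static void example_capture_affinity(void) {
#pragma omp parallel
  {
    char buf[256];
    // Returns the number of characters the expansion needs; if that is
    // >= sizeof(buf), the captured string was truncated and a buffer of
    // needed + 1 bytes would be required.
    size_t needed = omp_capture_affinity(buf, sizeof(buf), "thread %n binds to %A");
    if (needed < sizeof(buf))
      printf("%s\n", buf);
  }
}
#endif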
7918 
7919 // Displays the affinity string to stdout
7920 void __kmp_aux_display_affinity(int gtid, const char *format) {
7921  kmp_str_buf_t buf;
7922  __kmp_str_buf_init(&buf);
7923  __kmp_aux_capture_affinity(gtid, format, &buf);
7924  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7925  __kmp_str_buf_free(&buf);
7926 }
7927 
7928 /* ------------------------------------------------------------------------ */
7929 
7930 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7931  int blocktime = arg; /* argument is in milliseconds */
7932 #if KMP_USE_MONITOR
7933  int bt_intervals;
7934 #endif
7935  int bt_set;
7936 
7937  __kmp_save_internal_controls(thread);
7938 
7939  /* Normalize and set blocktime for the teams */
7940  if (blocktime < KMP_MIN_BLOCKTIME)
7941  blocktime = KMP_MIN_BLOCKTIME;
7942  else if (blocktime > KMP_MAX_BLOCKTIME)
7943  blocktime = KMP_MAX_BLOCKTIME;
7944 
7945  set__blocktime_team(thread->th.th_team, tid, blocktime);
7946  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7947 
7948 #if KMP_USE_MONITOR
7949  /* Calculate and set blocktime intervals for the teams */
7950  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7951 
7952  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7953  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7954 #endif
7955 
7956  /* Record that blocktime has been explicitly set (bt_set = TRUE) */
7957  bt_set = TRUE;
7958 
7959  set__bt_set_team(thread->th.th_team, tid, bt_set);
7960  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7961 #if KMP_USE_MONITOR
7962  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7963  "bt_intervals=%d, monitor_updates=%d\n",
7964  __kmp_gtid_from_tid(tid, thread->th.th_team),
7965  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7966  __kmp_monitor_wakeups));
7967 #else
7968  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7969  __kmp_gtid_from_tid(tid, thread->th.th_team),
7970  thread->th.th_team->t.t_id, tid, blocktime));
7971 #endif
7972 }
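
// User code typically reaches __kmp_aux_set_blocktime() through the
// kmp_set_blocktime() extension in omp.h (the initial value comes from the
// KMP_BLOCKTIME environment variable). A minimal sketch with a made-up
// function name, guarded with #if 0 because it is user-side code, not part
// of the runtime:
#if 0
#include <omp.h>

static void example_blocktime(void) {
  // Ask workers to spin for at most 1 ms after a parallel region before
  // sleeping; __kmp_aux_set_blocktime() clamps the value to
  // [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME].
  kmp_set_blocktime(1);
#pragma omp parallel
  {
    // ... parallel work ...
  }
}
#endif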
7973 
7974 void __kmp_aux_set_defaults(char const *str, int len) {
7975  if (!__kmp_init_serial) {
7976  __kmp_serial_initialize();
7977  }
7978  __kmp_env_initialize(str);
7979 
7980  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
7981  __kmp_env_print();
7982  }
7983 } // __kmp_aux_set_defaults
7984 
7985 /* ------------------------------------------------------------------------ */
7986 /* internal fast reduction routines */
7987 
7988 PACKED_REDUCTION_METHOD_T
7989 __kmp_determine_reduction_method(
7990  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7991  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7992  kmp_critical_name *lck) {
7993 
7994  // Default reduction method: critical construct ( lck != NULL, like in current
7995  // PAROPT )
7996  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7997  // can be selected by RTL
7998  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7999  // can be selected by RTL
8000  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8001  // among those generated by PAROPT.
8002 
8003  PACKED_REDUCTION_METHOD_T retval;
8004 
8005  int team_size;
8006 
8007  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8008  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8009 
8010 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8011  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8012 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8013 
8014  retval = critical_reduce_block;
8015 
8016  // Another way of getting the team size (with one dynamic dereference) is slower
8017  team_size = __kmp_get_team_num_threads(global_tid);
8018  if (team_size == 1) {
8019 
8020  retval = empty_reduce_block;
8021 
8022  } else {
8023 
8024  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8025 
8026 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8027 
8028 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8029  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8030 
8031  int teamsize_cutoff = 4;
8032 
8033 #if KMP_MIC_SUPPORTED
8034  if (__kmp_mic_type != non_mic) {
8035  teamsize_cutoff = 8;
8036  }
8037 #endif
8038  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8039  if (tree_available) {
8040  if (team_size <= teamsize_cutoff) {
8041  if (atomic_available) {
8042  retval = atomic_reduce_block;
8043  }
8044  } else {
8045  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8046  }
8047  } else if (atomic_available) {
8048  retval = atomic_reduce_block;
8049  }
8050 #else
8051 #error "Unknown or unsupported OS"
8052 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8053  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8054 
8055 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8056 
8057 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_KFREEBSD
8058 
8059  // basic tuning
8060 
8061  if (atomic_available) {
8062  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8063  retval = atomic_reduce_block;
8064  }
8065  } // otherwise: use critical section
8066 
8067 #elif KMP_OS_DARWIN
8068 
8069  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8070  if (atomic_available && (num_vars <= 3)) {
8071  retval = atomic_reduce_block;
8072  } else if (tree_available) {
8073  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8074  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8075  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8076  }
8077  } // otherwise: use critical section
8078 
8079 #else
8080 #error "Unknown or unsupported OS"
8081 #endif
8082 
8083 #else
8084 #error "Unknown or unsupported architecture"
8085 #endif
8086  }
8087 
8088  // KMP_FORCE_REDUCTION
8089 
8090  // If the team is serialized (team_size == 1), ignore the forced reduction
8091  // method and stay with the unsynchronized method (empty_reduce_block)
8092  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8093  team_size != 1) {
8094 
8095  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8096 
8097  int atomic_available, tree_available;
8098 
8099  switch ((forced_retval = __kmp_force_reduction_method)) {
8100  case critical_reduce_block:
8101  KMP_ASSERT(lck); // lck should be != 0
8102  break;
8103 
8104  case atomic_reduce_block:
8105  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8106  if (!atomic_available) {
8107  KMP_WARNING(RedMethodNotSupported, "atomic");
8108  forced_retval = critical_reduce_block;
8109  }
8110  break;
8111 
8112  case tree_reduce_block:
8113  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8114  if (!tree_available) {
8115  KMP_WARNING(RedMethodNotSupported, "tree");
8116  forced_retval = critical_reduce_block;
8117  } else {
8118 #if KMP_FAST_REDUCTION_BARRIER
8119  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8120 #endif
8121  }
8122  break;
8123 
8124  default:
8125  KMP_ASSERT(0); // "unsupported method specified"
8126  }
8127 
8128  retval = forced_retval;
8129  }
8130 
8131  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8132 
8133 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8134 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8135 
8136  return (retval);
8137 }
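
// The selection above is exercised by ordinary user-level reductions: the
// compiler lowers a reduction clause to __kmpc_reduce/__kmpc_reduce_nowait,
// which call __kmp_determine_reduction_method(). Whether the atomic, tree,
// or critical path is chosen depends on the architecture/OS branches above
// and, optionally, on the KMP_FORCE_REDUCTION environment variable that
// feeds __kmp_force_reduction_method. An illustrative user-side sketch
// (made-up function name, guarded with #if 0, not part of the runtime):
#if 0
static double example_reduction(const double *x, int n) {
  double sum = 0.0;
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < n; ++i)
    sum += x[i];
  return sum;
}
#endif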
8138 
8139 // This function is for testing the set/get/determine reduce method machinery
8140 kmp_int32 __kmp_get_reduce_method(void) {
8141  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8142 }
8143 
8144 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8145 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8146 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8147 
8148 // Hard pause shuts down the runtime completely. Resume happens naturally when
8149 // OpenMP is used subsequently.
8150 void __kmp_hard_pause() {
8151  __kmp_pause_status = kmp_hard_paused;
8152  __kmp_internal_end_thread(-1);
8153 }
8154 
8155 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8156 void __kmp_resume_if_soft_paused() {
8157  if (__kmp_pause_status == kmp_soft_paused) {
8158  __kmp_pause_status = kmp_not_paused;
8159 
8160  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8161  kmp_info_t *thread = __kmp_threads[gtid];
8162  if (thread) { // Wake it if sleeping
8163  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8164  if (fl.is_sleeping())
8165  fl.resume(gtid);
8166  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8167  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8168  } else { // thread holds the lock and may sleep soon
8169  do { // until either the thread sleeps, or we can get the lock
8170  if (fl.is_sleeping()) {
8171  fl.resume(gtid);
8172  break;
8173  } else if (__kmp_try_suspend_mx(thread)) {
8174  __kmp_unlock_suspend_mx(thread);
8175  break;
8176  }
8177  } while (1);
8178  }
8179  }
8180  }
8181  }
8182 }
8183 
8184 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8185 // TODO: add warning messages
8186 int __kmp_pause_resource(kmp_pause_status_t level) {
8187  if (level == kmp_not_paused) { // requesting resume
8188  if (__kmp_pause_status == kmp_not_paused) {
8189  // error message about runtime not being paused, so can't resume
8190  return 1;
8191  } else {
8192  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8193  __kmp_pause_status == kmp_hard_paused);
8194  __kmp_pause_status = kmp_not_paused;
8195  return 0;
8196  }
8197  } else if (level == kmp_soft_paused) { // requesting soft pause
8198  if (__kmp_pause_status != kmp_not_paused) {
8199  // error message about already being paused
8200  return 1;
8201  } else {
8202  __kmp_soft_pause();
8203  return 0;
8204  }
8205  } else if (level == kmp_hard_paused) { // requesting hard pause
8206  if (__kmp_pause_status != kmp_not_paused) {
8207  // error message about already being paused
8208  return 1;
8209  } else {
8210  __kmp_hard_pause();
8211  return 0;
8212  }
8213  } else {
8214  // error message about invalid level
8215  return 1;
8216  }
8217 }
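
// __kmp_pause_resource() backs the OpenMP 5.0 omp_pause_resource /
// omp_pause_resource_all entry points (reached via __kmpc_pause_resource, as
// noted above). An illustrative user-side sketch with a made-up function
// name, guarded with #if 0 because it is not part of the runtime:
#if 0
#include <omp.h>
#include <stdio.h>

static void example_pause(void) {
#pragma omp parallel
  { /* ... first phase ... */ }

  // Ask the runtime to release resources between phases; returns 0 on success.
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    printf("pause request was rejected\n");

#pragma omp parallel
  { /* worker threads are re-created/re-woken automatically on next use */ }
}
#endif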