1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #if OMPT_SUPPORT
19 #include "ompt-specific.h"
20 #endif
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #include "tsan_annotations.h"
28 
29 #if KMP_MIC && USE_NGO_STORES
30 // ICV copying
31 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
32 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
35 #else
36 #define ngo_load(src) ((void)0)
37 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
38 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
39 #define ngo_sync() ((void)0)
40 #endif /* KMP_MIC && USE_NGO_STORES */
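// Usage note: the ngo_* macros are always used in a load-once/store-many
// pattern. ngo_load() pulls one 64-byte ICV cache line into a local register
// (Vt on KNC); each ngo_store_*() writes that line to a destination
// (non-temporally on KNC, via copy_icvs()/KMP_MEMCPY() otherwise); and
// ngo_sync() fences the streaming stores. A typical ICV-push loop later in
// this file looks like:
//
//   ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
//   for (i = 1; i < nproc; ++i)
//     ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
//                    &team->t.t_implicit_task_taskdata[0].td_icvs);
//   ngo_sync();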
41 
42 void __kmp_print_structure(void); // Forward declaration
43 
44 // ---------------------------- Barrier Algorithms ----------------------------
45 
46 // Linear Barrier
47 template <bool cancellable = false>
48 static bool __kmp_linear_barrier_gather_template(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // Don't have to worry about sleep bit here or atomic since team setting
90  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92  // Collect all the worker team member threads.
93  for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95  // Prefetch next thread's arrived count
96  if (i + 1 < nproc)
97  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100  "arrived(%p) == %llu\n",
101  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102  team->t.t_id, i,
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105  // Wait for worker thread to arrive
106  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107  new_state);
108  if (cancellable) {
109  bool cancelled = flag.wait_cancellable_nosleep(
110  this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
111  if (cancelled)
112  return true;
113  } else {
114  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
115  }
116  ANNOTATE_BARRIER_END(other_threads[i]);
117 #if USE_ITT_BUILD && USE_ITT_NOTIFY
118  // Barrier imbalance - write min of the thread time and the other thread
119  // time to the thread.
120  if (__kmp_forkjoin_frames_mode == 2) {
121  this_thr->th.th_bar_min_time = KMP_MIN(
122  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
123  }
124 #endif
125  if (reduce) {
126  KA_TRACE(100,
127  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
128  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
129  team->t.t_id, i));
130  ANNOTATE_REDUCE_AFTER(reduce);
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  ANNOTATE_REDUCE_BEFORE(reduce);
134  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
135  }
136  }
137  // Don't have to worry about sleep bit here or atomic since team setting
138  team_bar->b_arrived = new_state;
139  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
140  "arrived(%p) = %llu\n",
141  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
142  new_state));
143  }
144  KA_TRACE(
145  20,
146  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
147  gtid, team->t.t_id, tid, bt));
148  return false;
149 }
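// Illustrative sketch (a simplified stand-alone model, not the runtime's real
// flag types): the linear gather above is a per-generation counter handshake.
// A worker bumps its own b_arrived past the team's current generation and must
// not touch the team afterwards; the master spins on each worker's counter in
// turn, folds in any reduction data, then publishes the new generation on the
// team counter.
#if 0
#include <atomic>
#include <cstdint>

struct toy_thread_bar {
  std::atomic<uint64_t> b_arrived{0};
};

// Worker side: signal arrival. The team may be freed by the master after this
// store, so nothing team-related may be read afterwards.
static void toy_worker_arrive(toy_thread_bar &mine, uint64_t bump) {
  mine.b_arrived.fetch_add(bump, std::memory_order_release);
}

// Master side: wait for every worker to reach the new generation, then
// advance the team-wide counter (only the master writes it).
static void toy_master_gather(toy_thread_bar *bars, int nproc,
                              uint64_t &team_arrived, uint64_t bump) {
  uint64_t new_state = team_arrived + bump;
  for (int i = 1; i < nproc; ++i)
    while (bars[i].b_arrived.load(std::memory_order_acquire) < new_state) {
      // spin; the real kmp_flag_64::wait() can yield or sleep instead
    }
  team_arrived = new_state;
}
#endif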
150 
151 template <bool cancellable = false>
152 static bool __kmp_linear_barrier_release_template(
153  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
154  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
155  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
156  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
157  kmp_team_t *team;
158 
159  if (KMP_MASTER_TID(tid)) {
160  unsigned int i;
161  kmp_uint32 nproc = this_thr->th.th_team_nproc;
162  kmp_info_t **other_threads;
163 
164  team = __kmp_threads[gtid]->th.th_team;
165  KMP_DEBUG_ASSERT(team != NULL);
166  other_threads = team->t.t_threads;
167 
168  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
169  "barrier type %d\n",
170  gtid, team->t.t_id, tid, bt));
171 
172  if (nproc > 1) {
173 #if KMP_BARRIER_ICV_PUSH
174  {
175  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
176  if (propagate_icvs) {
177  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
178  for (i = 1; i < nproc; ++i) {
179  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
180  team, i, FALSE);
181  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
182  &team->t.t_implicit_task_taskdata[0].td_icvs);
183  }
184  ngo_sync();
185  }
186  }
187 #endif // KMP_BARRIER_ICV_PUSH
188 
189  // Now, release all of the worker threads
190  for (i = 1; i < nproc; ++i) {
191 #if KMP_CACHE_MANAGE
192  // Prefetch next thread's go flag
193  if (i + 1 < nproc)
194  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
195 #endif /* KMP_CACHE_MANAGE */
196  KA_TRACE(
197  20,
198  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
199  "go(%p): %u => %u\n",
200  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
201  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
202  other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
204  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
205  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
206  other_threads[i]);
207  flag.release();
208  }
209  }
210  } else { // Wait for the MASTER thread to release us
211  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
212  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
213  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
214  if (cancellable) {
215  bool cancelled = flag.wait_cancellable_nosleep(
216  this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
217  if (cancelled) {
218  return true;
219  }
220  } else {
221  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
222  }
223  ANNOTATE_BARRIER_END(this_thr);
224 #if USE_ITT_BUILD && USE_ITT_NOTIFY
225  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
226  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
227  // disabled)
228  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
229  // Cancel wait on previous parallel region...
230  __kmp_itt_task_starting(itt_sync_obj);
231 
232  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
233  return false;
234 
235  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
236  if (itt_sync_obj != NULL)
237  // Call prepare as early as possible for "new" barrier
238  __kmp_itt_task_finished(itt_sync_obj);
239  } else
240 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
241  // Early exit for reaping threads releasing forkjoin barrier
242  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
243  return false;
244 // The worker thread may now assume that the team is valid.
245 #ifdef KMP_DEBUG
246  tid = __kmp_tid_from_gtid(gtid);
247  team = __kmp_threads[gtid]->th.th_team;
248 #endif
249  KMP_DEBUG_ASSERT(team != NULL);
250  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
251  KA_TRACE(20,
252  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
253  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
254  KMP_MB(); // Flush all pending memory write invalidates.
255  }
256  KA_TRACE(
257  20,
258  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
259  gtid, team->t.t_id, tid, bt));
260  return false;
261 }
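// Release protocol note (summary of the routine above): the master optionally
// pushes ICVs, then bumps each worker's private b_go flag by
// KMP_BARRIER_STATE_BUMP through kmp_flag_64::release(). Each worker waits in
// kmp_flag_64::wait() until its b_go flag is bumped, resets b_go to
// KMP_INIT_BARRIER_STATE for the next barrier, and only then may it assume the
// (possibly re-formed) team is valid again.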
262 
263 static void __kmp_linear_barrier_gather(
264  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
265  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
266  __kmp_linear_barrier_gather_template<false>(
267  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
268 }
269 
270 static bool __kmp_linear_barrier_gather_cancellable(
271  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
272  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
273  return __kmp_linear_barrier_gather_template<true>(
274  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
275 }
276 
277 static void __kmp_linear_barrier_release(
278  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
279  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
280  __kmp_linear_barrier_release_template<false>(
281  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
282 }
283 
284 static bool __kmp_linear_barrier_release_cancellable(
285  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
286  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
287  return __kmp_linear_barrier_release_template<true>(
288  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
289 }
290 
291 // Tree barrier
292 static void
293 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
294  int tid, void (*reduce)(void *, void *)
295  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
296  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
297  kmp_team_t *team = this_thr->th.th_team;
298  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
299  kmp_info_t **other_threads = team->t.t_threads;
300  kmp_uint32 nproc = this_thr->th.th_team_nproc;
301  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
302  kmp_uint32 branch_factor = 1 << branch_bits;
303  kmp_uint32 child;
304  kmp_uint32 child_tid;
305  kmp_uint64 new_state;
306 
307  KA_TRACE(
308  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
309  gtid, team->t.t_id, tid, bt));
310  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
311 
312 #if USE_ITT_BUILD && USE_ITT_NOTIFY
313  // Barrier imbalance - save arrive time to the thread
314  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
315  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
316  __itt_get_timestamp();
317  }
318 #endif
319  // Perform tree gather to wait until all threads have arrived; reduce any
320  // required data as we go
321  child_tid = (tid << branch_bits) + 1;
322  if (child_tid < nproc) {
323  // Parent threads wait for all their children to arrive
324  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
325  child = 1;
326  do {
327  kmp_info_t *child_thr = other_threads[child_tid];
328  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
329 #if KMP_CACHE_MANAGE
330  // Prefetch next thread's arrived count
331  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
332  KMP_CACHE_PREFETCH(
333  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
334 #endif /* KMP_CACHE_MANAGE */
335  KA_TRACE(20,
336  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
337  "arrived(%p) == %llu\n",
338  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
339  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
340  // Wait for child to arrive
341  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
342  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
343  ANNOTATE_BARRIER_END(child_thr);
344 #if USE_ITT_BUILD && USE_ITT_NOTIFY
345  // Barrier imbalance - write min of the thread time and a child time to
346  // the thread.
347  if (__kmp_forkjoin_frames_mode == 2) {
348  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
349  child_thr->th.th_bar_min_time);
350  }
351 #endif
352  if (reduce) {
353  KA_TRACE(100,
354  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
355  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
356  team->t.t_id, child_tid));
357  ANNOTATE_REDUCE_AFTER(reduce);
358  (*reduce)(this_thr->th.th_local.reduce_data,
359  child_thr->th.th_local.reduce_data);
360  ANNOTATE_REDUCE_BEFORE(reduce);
361  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
362  }
363  child++;
364  child_tid++;
365  } while (child <= branch_factor && child_tid < nproc);
366  }
367 
368  if (!KMP_MASTER_TID(tid)) { // Worker threads
369  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
370 
371  KA_TRACE(20,
372  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
373  "arrived(%p): %llu => %llu\n",
374  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
375  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
376  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
377 
378  // Mark arrival to parent thread
379  /* After performing this write, a worker thread may not assume that the team
380  is valid any more - it could be deallocated by the master thread at any
381  time. */
382  ANNOTATE_BARRIER_BEGIN(this_thr);
383  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
384  flag.release();
385  } else {
386  // Need to update the team arrived pointer if we are the master thread
387  if (nproc > 1) // New value was already computed above
388  team->t.t_bar[bt].b_arrived = new_state;
389  else
390  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
391  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
392  "arrived(%p) = %llu\n",
393  gtid, team->t.t_id, tid, team->t.t_id,
394  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
395  }
396  KA_TRACE(20,
397  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
398  gtid, team->t.t_id, tid, bt));
399 }
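// Topology note for the tree gather above: with branch_bits = b, thread `tid`
// gathers from children tid*2^b + 1 .. tid*2^b + 2^b (bounded by nproc), and a
// worker signals parent (tid - 1) >> b. For example, with branch_bits = 2
// (branch_factor = 4) and nproc = 16:
//   tid 0 gathers from 1, 2, 3, 4
//   tid 1 gathers from 5, 6, 7, 8
//   tid 2 gathers from 9, 10, 11, 12
//   tid 3 gathers from 13, 14, 15
//   tids 4..15 are leaves and only signal their parents.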
400 
401 static void __kmp_tree_barrier_release(
402  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
403  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
404  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
405  kmp_team_t *team;
406  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
407  kmp_uint32 nproc;
408  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
409  kmp_uint32 branch_factor = 1 << branch_bits;
410  kmp_uint32 child;
411  kmp_uint32 child_tid;
412 
413  // Perform a tree release for all of the threads that have been gathered
414  if (!KMP_MASTER_TID(
415  tid)) { // Handle fork barrier workers who aren't part of a team yet
416  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
417  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
418  // Wait for parent thread to release us
419  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
420  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
421  ANNOTATE_BARRIER_END(this_thr);
422 #if USE_ITT_BUILD && USE_ITT_NOTIFY
423  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
424  // In fork barrier where we could not get the object reliably (or
425  // ITTNOTIFY is disabled)
426  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
427  // Cancel wait on previous parallel region...
428  __kmp_itt_task_starting(itt_sync_obj);
429 
430  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
431  return;
432 
433  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
434  if (itt_sync_obj != NULL)
435  // Call prepare as early as possible for "new" barrier
436  __kmp_itt_task_finished(itt_sync_obj);
437  } else
438 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
439  // Early exit for reaping threads releasing forkjoin barrier
440  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
441  return;
442 
443  // The worker thread may now assume that the team is valid.
444  team = __kmp_threads[gtid]->th.th_team;
445  KMP_DEBUG_ASSERT(team != NULL);
446  tid = __kmp_tid_from_gtid(gtid);
447 
448  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
449  KA_TRACE(20,
450  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
451  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
452  KMP_MB(); // Flush all pending memory write invalidates.
453  } else {
454  team = __kmp_threads[gtid]->th.th_team;
455  KMP_DEBUG_ASSERT(team != NULL);
456  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
457  "barrier type %d\n",
458  gtid, team->t.t_id, tid, bt));
459  }
460  nproc = this_thr->th.th_team_nproc;
461  child_tid = (tid << branch_bits) + 1;
462 
463  if (child_tid < nproc) {
464  kmp_info_t **other_threads = team->t.t_threads;
465  child = 1;
466  // Parent threads release all their children
467  do {
468  kmp_info_t *child_thr = other_threads[child_tid];
469  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
470 #if KMP_CACHE_MANAGE
471  // Prefetch next thread's go count
472  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
473  KMP_CACHE_PREFETCH(
474  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
475 #endif /* KMP_CACHE_MANAGE */
476 
477 #if KMP_BARRIER_ICV_PUSH
478  {
479  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
480  if (propagate_icvs) {
481  __kmp_init_implicit_task(team->t.t_ident,
482  team->t.t_threads[child_tid], team,
483  child_tid, FALSE);
484  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
485  &team->t.t_implicit_task_taskdata[0].td_icvs);
486  }
487  }
488 #endif // KMP_BARRIER_ICV_PUSH
489  KA_TRACE(20,
490  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
491  "go(%p): %u => %u\n",
492  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
493  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
494  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
495  // Release child from barrier
496  ANNOTATE_BARRIER_BEGIN(child_thr);
497  kmp_flag_64 flag(&child_bar->b_go, child_thr);
498  flag.release();
499  child++;
500  child_tid++;
501  } while (child <= branch_factor && child_tid < nproc);
502  }
503  KA_TRACE(
504  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
505  gtid, team->t.t_id, tid, bt));
506 }
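// Release-side note: the tree release walks the same topology top-down. Each
// parent first copies the master's ICVs into a child's implicit task (when
// propagate_icvs is set) and only then bumps that child's b_go flag, so the
// child observes up-to-date ICVs before it starts releasing its own subtree.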
507 
508 // Hyper Barrier
509 static void
510 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
511  int tid, void (*reduce)(void *, void *)
512  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
513  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514  kmp_team_t *team = this_thr->th.th_team;
515  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516  kmp_info_t **other_threads = team->t.t_threads;
517  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520  kmp_uint32 branch_factor = 1 << branch_bits;
521  kmp_uint32 offset;
522  kmp_uint32 level;
523 
524  KA_TRACE(
525  20,
526  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
527  gtid, team->t.t_id, tid, bt));
528  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
529 
530 #if USE_ITT_BUILD && USE_ITT_NOTIFY
531  // Barrier imbalance - save arrive time to the thread
532  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
533  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
534  __itt_get_timestamp();
535  }
536 #endif
537  /* Perform a hypercube-embedded tree gather to wait until all of the threads
538  have arrived, and reduce any required data as we go. */
539  kmp_flag_64 p_flag(&thr_bar->b_arrived);
540  for (level = 0, offset = 1; offset < num_threads;
541  level += branch_bits, offset <<= branch_bits) {
542  kmp_uint32 child;
543  kmp_uint32 child_tid;
544 
545  if (((tid >> level) & (branch_factor - 1)) != 0) {
546  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
547 
548  KA_TRACE(20,
549  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
550  "arrived(%p): %llu => %llu\n",
551  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
552  team->t.t_id, parent_tid, &thr_bar->b_arrived,
553  thr_bar->b_arrived,
554  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
555  // Mark arrival to parent thread
556  /* After performing this write (in the last iteration of the enclosing for
557  loop), a worker thread may not assume that the team is valid any more
558  - it could be deallocated by the master thread at any time. */
559  ANNOTATE_BARRIER_BEGIN(this_thr);
560  p_flag.set_waiter(other_threads[parent_tid]);
561  p_flag.release();
562  break;
563  }
564 
565  // Parent threads wait for children to arrive
566  if (new_state == KMP_BARRIER_UNUSED_STATE)
567  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
568  for (child = 1, child_tid = tid + (1 << level);
569  child < branch_factor && child_tid < num_threads;
570  child++, child_tid += (1 << level)) {
571  kmp_info_t *child_thr = other_threads[child_tid];
572  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
573 #if KMP_CACHE_MANAGE
574  kmp_uint32 next_child_tid = child_tid + (1 << level);
575  // Prefetch next thread's arrived count
576  if (child + 1 < branch_factor && next_child_tid < num_threads)
577  KMP_CACHE_PREFETCH(
578  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
579 #endif /* KMP_CACHE_MANAGE */
580  KA_TRACE(20,
581  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
582  "arrived(%p) == %llu\n",
583  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
584  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
585  // Wait for child to arrive
586  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
587  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
588  ANNOTATE_BARRIER_END(child_thr);
589 #if USE_ITT_BUILD && USE_ITT_NOTIFY
590  // Barrier imbalance - write min of the thread time and a child time to
591  // the thread.
592  if (__kmp_forkjoin_frames_mode == 2) {
593  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
594  child_thr->th.th_bar_min_time);
595  }
596 #endif
597  if (reduce) {
598  KA_TRACE(100,
599  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
600  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
601  team->t.t_id, child_tid));
602  ANNOTATE_REDUCE_AFTER(reduce);
603  (*reduce)(this_thr->th.th_local.reduce_data,
604  child_thr->th.th_local.reduce_data);
605  ANNOTATE_REDUCE_BEFORE(reduce);
606  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
607  }
608  }
609  }
610 
611  if (KMP_MASTER_TID(tid)) {
612  // Need to update the team arrived pointer if we are the master thread
613  if (new_state == KMP_BARRIER_UNUSED_STATE)
614  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
615  else
616  team->t.t_bar[bt].b_arrived = new_state;
617  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
618  "arrived(%p) = %llu\n",
619  gtid, team->t.t_id, tid, team->t.t_id,
620  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
621  }
622  KA_TRACE(
623  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
624  gtid, team->t.t_id, tid, bt));
625 }
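// Topology note for the hypercube gather above (branch_bits = 1 case): at
// level k, a thread whose k-th tid bit is set signals the partner obtained by
// clearing its low (k+1) bits and then drops out; threads with that bit clear
// collect their partners for this level and continue. For nproc = 8:
//   level 0: 1->0, 3->2, 5->4, 7->6
//   level 1: 2->0, 6->4
//   level 2: 4->0
// With larger branch_bits the same scheme groups branch_factor = 2^branch_bits
// threads per level instead of pairs.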
626 
627 // The reverse versions seem to beat the forward versions overall
628 #define KMP_REVERSE_HYPER_BAR
629 static void __kmp_hyper_barrier_release(
630  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
631  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
632  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
633  kmp_team_t *team;
634  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
635  kmp_info_t **other_threads;
636  kmp_uint32 num_threads;
637  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
638  kmp_uint32 branch_factor = 1 << branch_bits;
639  kmp_uint32 child;
640  kmp_uint32 child_tid;
641  kmp_uint32 offset;
642  kmp_uint32 level;
643 
644  /* Perform a hypercube-embedded tree release for all of the threads that have
645  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
646  are released in the reverse order of the corresponding gather, otherwise
647  threads are released in the same order. */
648  if (KMP_MASTER_TID(tid)) { // master
649  team = __kmp_threads[gtid]->th.th_team;
650  KMP_DEBUG_ASSERT(team != NULL);
651  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
652  "barrier type %d\n",
653  gtid, team->t.t_id, tid, bt));
654 #if KMP_BARRIER_ICV_PUSH
655  if (propagate_icvs) { // master already has ICVs in final destination; copy
656  copy_icvs(&thr_bar->th_fixed_icvs,
657  &team->t.t_implicit_task_taskdata[tid].td_icvs);
658  }
659 #endif
660  } else { // Handle fork barrier workers who aren't part of a team yet
661  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
662  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
663  // Wait for parent thread to release us
664  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
665  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
666  ANNOTATE_BARRIER_END(this_thr);
667 #if USE_ITT_BUILD && USE_ITT_NOTIFY
668  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
669  // In fork barrier where we could not get the object reliably
670  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
671  // Cancel wait on previous parallel region...
672  __kmp_itt_task_starting(itt_sync_obj);
673 
674  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
675  return;
676 
677  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
678  if (itt_sync_obj != NULL)
679  // Call prepare as early as possible for "new" barrier
680  __kmp_itt_task_finished(itt_sync_obj);
681  } else
682 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
683  // Early exit for reaping threads releasing forkjoin barrier
684  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
685  return;
686 
687  // The worker thread may now assume that the team is valid.
688  team = __kmp_threads[gtid]->th.th_team;
689  KMP_DEBUG_ASSERT(team != NULL);
690  tid = __kmp_tid_from_gtid(gtid);
691 
692  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
693  KA_TRACE(20,
694  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
695  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
696  KMP_MB(); // Flush all pending memory write invalidates.
697  }
698  num_threads = this_thr->th.th_team_nproc;
699  other_threads = team->t.t_threads;
700 
701 #ifdef KMP_REVERSE_HYPER_BAR
702  // Count up to correct level for parent
703  for (level = 0, offset = 1;
704  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
705  level += branch_bits, offset <<= branch_bits)
706  ;
707 
708  // Now go down from there
709  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
710  level -= branch_bits, offset >>= branch_bits)
711 #else
712  // Go down the tree, level by level
713  for (level = 0, offset = 1; offset < num_threads;
714  level += branch_bits, offset <<= branch_bits)
715 #endif // KMP_REVERSE_HYPER_BAR
716  {
717 #ifdef KMP_REVERSE_HYPER_BAR
718  /* Now go in reverse order through the children, highest to lowest.
719  Initial setting of child is conservative here. */
720  child = num_threads >> ((level == 0) ? level : level - 1);
721  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
722  child_tid = tid + (child << level);
723  child >= 1; child--, child_tid -= (1 << level))
724 #else
725  if (((tid >> level) & (branch_factor - 1)) != 0)
726  // No need to go lower than this, since this is the level parent would be
727  // notified
728  break;
729  // Iterate through children on this level of the tree
730  for (child = 1, child_tid = tid + (1 << level);
731  child < branch_factor && child_tid < num_threads;
732  child++, child_tid += (1 << level))
733 #endif // KMP_REVERSE_HYPER_BAR
734  {
735  if (child_tid >= num_threads)
736  continue; // Child doesn't exist so keep going
737  else {
738  kmp_info_t *child_thr = other_threads[child_tid];
739  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
740 #if KMP_CACHE_MANAGE
741  kmp_uint32 next_child_tid = child_tid - (1 << level);
742 // Prefetch next thread's go count
743 #ifdef KMP_REVERSE_HYPER_BAR
744  if (child - 1 >= 1 && next_child_tid < num_threads)
745 #else
746  if (child + 1 < branch_factor && next_child_tid < num_threads)
747 #endif // KMP_REVERSE_HYPER_BAR
748  KMP_CACHE_PREFETCH(
749  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
750 #endif /* KMP_CACHE_MANAGE */
751 
752 #if KMP_BARRIER_ICV_PUSH
753  if (propagate_icvs) // push my fixed ICVs to my child
754  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
755 #endif // KMP_BARRIER_ICV_PUSH
756 
757  KA_TRACE(
758  20,
759  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
760  "go(%p): %u => %u\n",
761  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
762  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
763  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
764  // Release child from barrier
765  ANNOTATE_BARRIER_BEGIN(child_thr);
766  kmp_flag_64 flag(&child_bar->b_go, child_thr);
767  flag.release();
768  }
769  }
770  }
771 #if KMP_BARRIER_ICV_PUSH
772  if (propagate_icvs &&
773  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
774  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
775  FALSE);
776  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
777  &thr_bar->th_fixed_icvs);
778  }
779 #endif
780  KA_TRACE(
781  20,
782  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
783  gtid, team->t.t_id, tid, bt));
784 }
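// Release-side note: with KMP_REVERSE_HYPER_BAR (the default above) the
// release wakes children in the reverse of the gather order, i.e. the most
// distant subtrees first, which the comment above reports as the faster
// variant. ICVs travel with the release: the master copies its ICVs into
// th_fixed_icvs, each parent pushes that copy into its children's
// th_fixed_icvs before bumping their b_go, and every non-master thread finally
// copies th_fixed_icvs into its own implicit task.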
785 
786 // Hierarchical Barrier
787 
788 // Initialize thread barrier data
789 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
790  Performs the minimum amount of initialization required based on how the team
791  has changed. Returns true if leaf children will require both on-core and
792  traditional wake-up mechanisms. For example, if the team size increases,
793  threads already in the team will respond to on-core wakeup on their parent
794  thread, but threads newly added to the team will only be listening on
795  their local b_go. */
796 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
797  kmp_bstate_t *thr_bar,
798  kmp_uint32 nproc, int gtid,
799  int tid, kmp_team_t *team) {
800  // Checks to determine if (re-)initialization is needed
801  bool uninitialized = thr_bar->team == NULL;
802  bool team_changed = team != thr_bar->team;
803  bool team_sz_changed = nproc != thr_bar->nproc;
804  bool tid_changed = tid != thr_bar->old_tid;
805  bool retval = false;
806 
807  if (uninitialized || team_sz_changed) {
808  __kmp_get_hierarchy(nproc, thr_bar);
809  }
810 
811  if (uninitialized || team_sz_changed || tid_changed) {
812  thr_bar->my_level = thr_bar->depth - 1; // default for master
813  thr_bar->parent_tid = -1; // default for master
814  if (!KMP_MASTER_TID(
815  tid)) { // if not master, find parent thread in hierarchy
816  kmp_uint32 d = 0;
817  while (d < thr_bar->depth) { // find parent based on level of thread in
818  // hierarchy, and note level
819  kmp_uint32 rem;
820  if (d == thr_bar->depth - 2) { // reached level right below the master
821  thr_bar->parent_tid = 0;
822  thr_bar->my_level = d;
823  break;
824  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
825  0) { // TODO: can we make this op faster?
826  // thread is not a subtree root at next level, so this is max
827  thr_bar->parent_tid = tid - rem;
828  thr_bar->my_level = d;
829  break;
830  }
831  ++d;
832  }
833  }
834  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
835  thr_bar->old_tid = tid;
836  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
837  thr_bar->team = team;
838  thr_bar->parent_bar =
839  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
840  }
841  if (uninitialized || team_changed || tid_changed) {
842  thr_bar->team = team;
843  thr_bar->parent_bar =
844  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
845  retval = true;
846  }
847  if (uninitialized || team_sz_changed || tid_changed) {
848  thr_bar->nproc = nproc;
849  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
850  if (thr_bar->my_level == 0)
851  thr_bar->leaf_kids = 0;
852  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
853  thr_bar->leaf_kids = nproc - tid - 1;
854  thr_bar->leaf_state = 0;
855  for (int i = 0; i < thr_bar->leaf_kids; ++i)
856  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
857  }
858  return retval;
859 }
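// Layout note for the hierarchical barrier: in the on-core (infinite
// blocktime) path, leaf children check in through single-byte writes into
// their parent's 64-bit flag instead of through their own b_arrived/b_go.
// thr_bar->offset = 7 - (tid - parent_tid - 1) names the byte a thread owns in
// its parent's flag, and leaf_state has a 1 in the byte owned by each leaf
// kid; e.g. a parent at tid p with three leaf kids p+1, p+2, p+3 expects bytes
// 7, 6 and 5 of its flag to be set. The 8-byte flag therefore accommodates at
// most eight directly checking-in leaf children per parent.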
860 
861 static void __kmp_hierarchical_barrier_gather(
862  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
864  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
865  kmp_team_t *team = this_thr->th.th_team;
866  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
867  kmp_uint32 nproc = this_thr->th.th_team_nproc;
868  kmp_info_t **other_threads = team->t.t_threads;
869  kmp_uint64 new_state;
870 
871  int level = team->t.t_level;
872  if (other_threads[0]
873  ->th.th_teams_microtask) // are we inside the teams construct?
874  if (this_thr->th.th_teams_size.nteams > 1)
875  ++level; // level was not increased in teams construct for team_of_masters
876  if (level == 1)
877  thr_bar->use_oncore_barrier = 1;
878  else
879  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
880 
881  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
882  "barrier type %d\n",
883  gtid, team->t.t_id, tid, bt));
884  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
885 
886 #if USE_ITT_BUILD && USE_ITT_NOTIFY
887  // Barrier imbalance - save arrive time to the thread
888  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
889  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
890  }
891 #endif
892 
893  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
894  team);
895 
896  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
897  kmp_int32 child_tid;
898  new_state =
899  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
900  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
901  thr_bar->use_oncore_barrier) {
902  if (thr_bar->leaf_kids) {
903  // First, wait for leaf children to check-in on my b_arrived flag
904  kmp_uint64 leaf_state =
905  KMP_MASTER_TID(tid)
906  ? thr_bar->b_arrived | thr_bar->leaf_state
907  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
908  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
909  "for leaf kids\n",
910  gtid, team->t.t_id, tid));
911  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
912  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
913  if (reduce) {
914  ANNOTATE_REDUCE_AFTER(reduce);
915  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
916  ++child_tid) {
917  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
918  "T#%d(%d:%d)\n",
919  gtid, team->t.t_id, tid,
920  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
921  child_tid));
922  ANNOTATE_BARRIER_END(other_threads[child_tid]);
923  (*reduce)(this_thr->th.th_local.reduce_data,
924  other_threads[child_tid]->th.th_local.reduce_data);
925  }
926  ANNOTATE_REDUCE_BEFORE(reduce);
927  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
928  }
929  // clear leaf_state bits
930  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
931  }
932  // Next, wait for higher level children on each child's b_arrived flag
933  for (kmp_uint32 d = 1; d < thr_bar->my_level;
934  ++d) { // gather lowest level threads first, but skip 0
935  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
936  skip = thr_bar->skip_per_level[d];
937  if (last > nproc)
938  last = nproc;
939  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
940  kmp_info_t *child_thr = other_threads[child_tid];
941  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
942  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
943  "T#%d(%d:%d) "
944  "arrived(%p) == %llu\n",
945  gtid, team->t.t_id, tid,
946  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
947  child_tid, &child_bar->b_arrived, new_state));
948  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
949  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
950  ANNOTATE_BARRIER_END(child_thr);
951  if (reduce) {
952  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
953  "T#%d(%d:%d)\n",
954  gtid, team->t.t_id, tid,
955  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
956  child_tid));
957  ANNOTATE_REDUCE_AFTER(reduce);
958  (*reduce)(this_thr->th.th_local.reduce_data,
959  child_thr->th.th_local.reduce_data);
960  ANNOTATE_REDUCE_BEFORE(reduce);
961  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
962  }
963  }
964  }
965  } else { // Blocktime is not infinite
966  for (kmp_uint32 d = 0; d < thr_bar->my_level;
967  ++d) { // Gather lowest level threads first
968  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
969  skip = thr_bar->skip_per_level[d];
970  if (last > nproc)
971  last = nproc;
972  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
973  kmp_info_t *child_thr = other_threads[child_tid];
974  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
975  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
976  "T#%d(%d:%d) "
977  "arrived(%p) == %llu\n",
978  gtid, team->t.t_id, tid,
979  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
980  child_tid, &child_bar->b_arrived, new_state));
981  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
982  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
983  ANNOTATE_BARRIER_END(child_thr);
984  if (reduce) {
985  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
986  "T#%d(%d:%d)\n",
987  gtid, team->t.t_id, tid,
988  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
989  child_tid));
990  ANNOTATE_REDUCE_AFTER(reduce);
991  (*reduce)(this_thr->th.th_local.reduce_data,
992  child_thr->th.th_local.reduce_data);
993  ANNOTATE_REDUCE_BEFORE(reduce);
994  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
995  }
996  }
997  }
998  }
999  }
1000  // All subordinates are gathered; now release parent if not master thread
1001 
1002  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1003  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1004  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1005  gtid, team->t.t_id, tid,
1006  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1007  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1008  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1009  /* Mark arrival to parent: After performing this write, a worker thread may
1010  not assume that the team is valid any more - it could be deallocated by
1011  the master thread at any time. */
1012  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1013  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1014  // flag; release it
1015  ANNOTATE_BARRIER_BEGIN(this_thr);
1016  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1017  flag.release();
1018  } else {
1019  // Leaf does special release on "offset" bits of parent's b_arrived flag
1020  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1021  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1022  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1023  flag.release();
1024  }
1025  } else { // Master thread needs to update the team's b_arrived value
1026  team->t.t_bar[bt].b_arrived = new_state;
1027  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1028  "arrived(%p) = %llu\n",
1029  gtid, team->t.t_id, tid, team->t.t_id,
1030  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1031  }
1032  // Is the team access below unsafe or just technically invalid?
1033  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1034  "barrier type %d\n",
1035  gtid, team->t.t_id, tid, bt));
1036 }
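// Gather-side summary: with infinite blocktime and the on-core barrier, a
// parent first waits for its own b_arrived to carry every leaf kid's byte in
// addition to the current arrived state (leaf_state), clears those bytes with
// KMP_TEST_THEN_AND64, and then gathers its non-leaf children level by level
// with ordinary kmp_flag_64 waits. When blocktime is finite, the on-core
// shortcut is skipped and all children are gathered through their b_arrived
// flags.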
1037 
1038 static void __kmp_hierarchical_barrier_release(
1039  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1040  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1041  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1042  kmp_team_t *team;
1043  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1044  kmp_uint32 nproc;
1045  bool team_change = false; // indicates on-core barrier shouldn't be used
1046 
1047  if (KMP_MASTER_TID(tid)) {
1048  team = __kmp_threads[gtid]->th.th_team;
1049  KMP_DEBUG_ASSERT(team != NULL);
1050  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1051  "entered barrier type %d\n",
1052  gtid, team->t.t_id, tid, bt));
1053  } else { // Worker threads
1054  // Wait for parent thread to release me
1055  if (!thr_bar->use_oncore_barrier ||
1056  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1057  thr_bar->team == NULL) {
1058  // Use traditional method of waiting on my own b_go flag
1059  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1060  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1061  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1062  ANNOTATE_BARRIER_END(this_thr);
1063  TCW_8(thr_bar->b_go,
1064  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1065  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1066  // infinite, not nested
1067  // Wait on my "offset" bits on parent's b_go flag
1068  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1069  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1070  thr_bar->offset, bt,
1071  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1072  flag.wait(this_thr, TRUE);
1073  if (thr_bar->wait_flag ==
1074  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1075  TCW_8(thr_bar->b_go,
1076  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077  } else { // Reset my bits on parent's b_go flag
1078  (RCAST(volatile char *,
1079  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1080  }
1081  }
1082  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1083  // Early exit for reaping threads releasing forkjoin barrier
1084  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1085  return;
1086  // The worker thread may now assume that the team is valid.
1087  team = __kmp_threads[gtid]->th.th_team;
1088  KMP_DEBUG_ASSERT(team != NULL);
1089  tid = __kmp_tid_from_gtid(gtid);
1090 
1091  KA_TRACE(
1092  20,
1093  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1094  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1095  KMP_MB(); // Flush all pending memory write invalidates.
1096  }
1097 
1098  nproc = this_thr->th.th_team_nproc;
1099  int level = team->t.t_level;
1100  if (team->t.t_threads[0]
1101  ->th.th_teams_microtask) { // are we inside the teams construct?
1102  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1103  this_thr->th.th_teams_level == level)
1104  ++level; // level was not increased in teams construct for team_of_workers
1105  if (this_thr->th.th_teams_size.nteams > 1)
1106  ++level; // level was not increased in teams construct for team_of_masters
1107  }
1108  if (level == 1)
1109  thr_bar->use_oncore_barrier = 1;
1110  else
1111  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1112 
1113  // If the team size has increased, we still communicate with old leaves via
1114  // oncore barrier.
1115  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1116  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1117  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1118  tid, team);
1119  // But if the entire team changes, we won't use oncore barrier at all
1120  if (team_change)
1121  old_leaf_kids = 0;
1122 
1123 #if KMP_BARRIER_ICV_PUSH
1124  if (propagate_icvs) {
1125  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1126  FALSE);
1127  if (KMP_MASTER_TID(
1128  tid)) { // master already has copy in final destination; copy
1129  copy_icvs(&thr_bar->th_fixed_icvs,
1130  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1131  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1132  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1133  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1134  // leaves (on-core children) pull parent's fixed ICVs directly to local
1135  // ICV store
1136  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1137  &thr_bar->parent_bar->th_fixed_icvs);
1138  // non-leaves will get ICVs piggybacked with b_go via NGO store
1139  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1140  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1141  // access
1142  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1143  else // leaves copy parent's fixed ICVs directly to local ICV store
1144  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1145  &thr_bar->parent_bar->th_fixed_icvs);
1146  }
1147  }
1148 #endif // KMP_BARRIER_ICV_PUSH
1149 
1150  // Now, release my children
1151  if (thr_bar->my_level) { // not a leaf
1152  kmp_int32 child_tid;
1153  kmp_uint32 last;
1154  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1155  thr_bar->use_oncore_barrier) {
1156  if (KMP_MASTER_TID(tid)) { // do a flat release
1157  // Set local b_go to bump children via NGO store of the cache line
1158  // containing ICVs and b_go.
1159  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1160  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1161  // the cache line
1162  ngo_load(&thr_bar->th_fixed_icvs);
1163  // This loops over all the threads skipping only the leaf nodes in the
1164  // hierarchy
1165  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1166  child_tid += thr_bar->skip_per_level[1]) {
1167  kmp_bstate_t *child_bar =
1168  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1169  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1170  "releasing T#%d(%d:%d)"
1171  " go(%p): %u => %u\n",
1172  gtid, team->t.t_id, tid,
1173  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1174  child_tid, &child_bar->b_go, child_bar->b_go,
1175  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1176  // Use ngo store (if available) to both store ICVs and release child
1177  // via child's b_go
1178  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1179  }
1180  ngo_sync();
1181  }
1182  TCW_8(thr_bar->b_go,
1183  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1184  // Now, release leaf children
1185  if (thr_bar->leaf_kids) { // if there are any
1186  // We test team_change on the off-chance that the level 1 team changed.
1187  if (team_change ||
1188  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1189  if (old_leaf_kids) { // release old leaf kids
1190  thr_bar->b_go |= old_leaf_state;
1191  }
1192  // Release new leaf kids
1193  last = tid + thr_bar->skip_per_level[1];
1194  if (last > nproc)
1195  last = nproc;
1196  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1197  ++child_tid) { // skip_per_level[0]=1
1198  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1199  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1200  KA_TRACE(
1201  20,
1202  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1203  " T#%d(%d:%d) go(%p): %u => %u\n",
1204  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1205  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1206  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1207  // Release child using child's b_go flag
1208  ANNOTATE_BARRIER_BEGIN(child_thr);
1209  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1210  flag.release();
1211  }
1212  } else { // Release all children at once with leaf_state bits on my own
1213  // b_go flag
1214  thr_bar->b_go |= thr_bar->leaf_state;
1215  }
1216  }
1217  } else { // Blocktime is not infinite; do a simple hierarchical release
1218  for (int d = thr_bar->my_level - 1; d >= 0;
1219  --d) { // Release highest level threads first
1220  last = tid + thr_bar->skip_per_level[d + 1];
1221  kmp_uint32 skip = thr_bar->skip_per_level[d];
1222  if (last > nproc)
1223  last = nproc;
1224  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1225  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1226  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1227  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1228  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1229  gtid, team->t.t_id, tid,
1230  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1231  child_tid, &child_bar->b_go, child_bar->b_go,
1232  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1233  // Release child using child's b_go flag
1234  ANNOTATE_BARRIER_BEGIN(child_thr);
1235  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1236  flag.release();
1237  }
1238  }
1239  }
1240 #if KMP_BARRIER_ICV_PUSH
1241  if (propagate_icvs && !KMP_MASTER_TID(tid))
1242  // non-leaves copy ICVs from fixed ICVs to local dest
1243  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1244  &thr_bar->th_fixed_icvs);
1245 #endif // KMP_BARRIER_ICV_PUSH
1246  }
1247  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1248  "barrier type %d\n",
1249  gtid, team->t.t_id, tid, bt));
1250 }
1251 
1252 // End of Barrier Algorithms
1253 
1254 // type traits for cancellable value
1255 // if cancellable is true, then is_cancellable is a normal boolean variable
1256 // if cancellable is false, then is_cancellable is a compile time constant
1257 template <bool cancellable> struct is_cancellable {};
1258 template <> struct is_cancellable<true> {
1259  bool value;
1260  is_cancellable() : value(false) {}
1261  is_cancellable(bool b) : value(b) {}
1262  is_cancellable &operator=(bool b) {
1263  value = b;
1264  return *this;
1265  }
1266  operator bool() const { return value; }
1267 };
1268 template <> struct is_cancellable<false> {
1269  is_cancellable &operator=(bool b) { return *this; }
1270  constexpr operator bool() const { return false; }
1271 };
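// The two specializations above give the barrier template a "maybe a bool":
// when cancellable is true, is_cancellable<true> carries a real flag; when it
// is false, assignment is a no-op and the conversion is a constexpr false, so
// every `if (cancelled)` in the template folds away at compile time. A minimal
// stand-alone sketch of the same idiom (illustrative only, hypothetical
// names):
#if 0
template <bool Enabled> struct maybe_flag;
template <> struct maybe_flag<true> {
  bool value = false;
  maybe_flag &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct maybe_flag<false> {
  maybe_flag &operator=(bool) { return *this; } // assignment is a no-op
  constexpr operator bool() const { return false; } // statically false
};

static bool phase_was_cancelled() { return false; } // stand-in for real work

template <bool Cancellable> static int run_phase() {
  maybe_flag<Cancellable> cancelled;
  cancelled = phase_was_cancelled(); // ignored when Cancellable == false
  if (cancelled) // dead code when Cancellable == false
    return 1;
  return 0;
}
#endif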
1272 
1273 // Internal function to do a barrier.
1274 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1275  If reduce is non-NULL, do a split reduction barrier; otherwise, do a split
1276  barrier without a reduction.
1277  When cancellable = false,
1278  Returns 0 if master thread, 1 if worker thread.
1279  When cancellable = true
1280  Returns 0 if not cancelled, 1 if cancelled. */
1281 template <bool cancellable = false>
1282 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1283  size_t reduce_size, void *reduce_data,
1284  void (*reduce)(void *, void *)) {
1285  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1286  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1287  int tid = __kmp_tid_from_gtid(gtid);
1288  kmp_info_t *this_thr = __kmp_threads[gtid];
1289  kmp_team_t *team = this_thr->th.th_team;
1290  int status = 0;
1291  is_cancellable<cancellable> cancelled;
1292 #if OMPT_SUPPORT && OMPT_OPTIONAL
1293  ompt_data_t *my_task_data;
1294  ompt_data_t *my_parallel_data;
1295  void *return_address;
1296  ompt_sync_region_t barrier_kind;
1297 #endif
1298 
1299  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1300  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1301 
1302  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1303 #if OMPT_SUPPORT
1304  if (ompt_enabled.enabled) {
1305 #if OMPT_OPTIONAL
1306  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1307  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1308  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1309  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1310  if (ompt_enabled.ompt_callback_sync_region) {
1311  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1312  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1313  return_address);
1314  }
1315  if (ompt_enabled.ompt_callback_sync_region_wait) {
1316  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1317  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1318  return_address);
1319  }
1320 #endif
1321  // It is OK to report the barrier state after the barrier begin callback.
1322  // According to the OMPT specification, a compliant implementation may
1323  // even delay reporting this state until the barrier begins to wait.
1324  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1325  }
1326 #endif
1327 
1328  if (!team->t.t_serialized) {
1329 #if USE_ITT_BUILD
1330  // This value will be used in itt notify events below.
1331  void *itt_sync_obj = NULL;
1332 #if USE_ITT_NOTIFY
1333  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1334  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1335 #endif
1336 #endif /* USE_ITT_BUILD */
1337  if (__kmp_tasking_mode == tskm_extra_barrier) {
1338  __kmp_tasking_barrier(team, this_thr, gtid);
1339  KA_TRACE(15,
1340  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1341  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1342  }
1343 
1344  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1345  access it when the team struct is not guaranteed to exist. */
1346  // See note about the corresponding code in __kmp_join_barrier() being
1347  // performance-critical.
1348  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1349 #if KMP_USE_MONITOR
1350  this_thr->th.th_team_bt_intervals =
1351  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1352  this_thr->th.th_team_bt_set =
1353  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1354 #else
1355  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1356 #endif
1357  }
1358 
1359 #if USE_ITT_BUILD
1360  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1361  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1362 #endif /* USE_ITT_BUILD */
1363 #if USE_DEBUGGER
1364  // Let the debugger know: the thread arrived at the barrier and is waiting.
1365  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1366  team->t.t_bar[bt].b_master_arrived += 1;
1367  } else {
1368  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1369  } // if
1370 #endif /* USE_DEBUGGER */
1371  if (reduce != NULL) {
1372  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1373  this_thr->th.th_local.reduce_data = reduce_data;
1374  }
1375 
1376  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1377  // use 0 to only set up the current team if nthreads > 1
1378  __kmp_task_team_setup(this_thr, team, 0);
1379 
1380  if (cancellable) {
1381  cancelled = __kmp_linear_barrier_gather_cancellable(
1382  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1383  } else {
1384  switch (__kmp_barrier_gather_pattern[bt]) {
1385  case bp_hyper_bar: {
1386  // don't set branch bits to 0; use linear
1387  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1388  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1389  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1390  break;
1391  }
1392  case bp_hierarchical_bar: {
1393  __kmp_hierarchical_barrier_gather(
1394  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395  break;
1396  }
1397  case bp_tree_bar: {
1398  // don't set branch bits to 0; use linear
1399  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1401  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402  break;
1403  }
1404  default: {
1405  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1406  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407  }
1408  }
1409  }
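  // Editorial note (hedged): the gather algorithm is selected per barrier type
  // from __kmp_barrier_gather_pattern[bt], with the linear gather as the
  // default fallback above. The pattern is normally chosen via environment
  // settings parsed elsewhere in the runtime (the KMP_*_BARRIER_PATTERN
  // variables handled in kmp_settings.cpp are assumed here); for example,
  // setting KMP_PLAIN_BARRIER_PATTERN="hyper,hyper" is assumed to select the
  // hypercube gather/release pair for plain barriers.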
1410 
1411  KMP_MB();
1412 
1413  if (KMP_MASTER_TID(tid)) {
1414  status = 0;
1415  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1416  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1417  }
1418 #if USE_DEBUGGER
1419  // Let the debugger know: all threads have arrived and are starting to leave
1420  // the barrier.
1421  team->t.t_bar[bt].b_team_arrived += 1;
1422 #endif
1423 
1424  if (__kmp_omp_cancellation) {
1425  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1426  // Reset cancellation flag for worksharing constructs
1427  if (cancel_request == cancel_loop ||
1428  cancel_request == cancel_sections) {
1429  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1430  }
1431  }
1432 #if USE_ITT_BUILD
1433  /* TODO: In case of a split reduction barrier, the master thread may send
1434  the acquired event early, before the final summation into the shared
1435  variable is done (final summation can be a long operation for array
1436  reductions). */
1437  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1438  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1439 #endif /* USE_ITT_BUILD */
1440 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1441  // Barrier - report frame end (only if active_level == 1)
1442  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1443  __kmp_forkjoin_frames_mode &&
1444  this_thr->th.th_teams_microtask == NULL &&
1445  team->t.t_active_level == 1) {
1446  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1447  kmp_uint64 cur_time = __itt_get_timestamp();
1448  kmp_info_t **other_threads = team->t.t_threads;
1449  int nproc = this_thr->th.th_team_nproc;
1450  int i;
1451  switch (__kmp_forkjoin_frames_mode) {
1452  case 1:
1453  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1454  loc, nproc);
1455  this_thr->th.th_frame_time = cur_time;
1456  break;
1457  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1458  // be fixed)
1459  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1460  1, loc, nproc);
1461  break;
1462  case 3:
1463  if (__itt_metadata_add_ptr) {
1464  // Initialize with master's wait time
1465  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1466  // Set arrive time to zero to be able to check it in
1467  // __kmp_invoke_task(); the same is done inside the loop below
1468  this_thr->th.th_bar_arrive_time = 0;
1469  for (i = 1; i < nproc; ++i) {
1470  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1471  other_threads[i]->th.th_bar_arrive_time = 0;
1472  }
1473  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1474  cur_time, delta,
1475  (kmp_uint64)(reduce != NULL));
1476  }
1477  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1478  loc, nproc);
1479  this_thr->th.th_frame_time = cur_time;
1480  break;
1481  }
1482  }
1483 #endif /* USE_ITT_BUILD */
1484  } else {
1485  status = 1;
1486 #if USE_ITT_BUILD
1487  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1488  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1489 #endif /* USE_ITT_BUILD */
1490  }
1491  if ((status == 1 || !is_split) && !cancelled) {
1492  if (cancellable) {
1493  cancelled = __kmp_linear_barrier_release_cancellable(
1494  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1495  } else {
1496  switch (__kmp_barrier_release_pattern[bt]) {
1497  case bp_hyper_bar: {
1498  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1499  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1500  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1501  break;
1502  }
1503  case bp_hierarchical_bar: {
1504  __kmp_hierarchical_barrier_release(
1505  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1506  break;
1507  }
1508  case bp_tree_bar: {
1509  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1510  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1511  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1512  break;
1513  }
1514  default: {
1515  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1516  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1517  }
1518  }
1519  }
1520  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1521  __kmp_task_team_sync(this_thr, team);
1522  }
1523  }
1524 
1525 #if USE_ITT_BUILD
1526  /* GEH: TODO: Move this under if-condition above and also include in
1527  __kmp_end_split_barrier(). This will more accurately represent the actual
1528  release time of the threads for split barriers. */
1529  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1530  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1531 #endif /* USE_ITT_BUILD */
1532  } else { // Team is serialized.
1533  status = 0;
1534  if (__kmp_tasking_mode != tskm_immediate_exec) {
1535  if (this_thr->th.th_task_team != NULL) {
1536 #if USE_ITT_NOTIFY
1537  void *itt_sync_obj = NULL;
1538  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1539  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1540  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1541  }
1542 #endif
1543 
1544  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1545  TRUE);
1546  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1547  __kmp_task_team_setup(this_thr, team, 0);
1548 
1549 #if USE_ITT_BUILD
1550  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1551  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1552 #endif /* USE_ITT_BUILD */
1553  }
1554  }
1555  }
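  // Editorial note: for a serialized team the barrier reduces to the
  // proxy-task handling above; the only work that can require waiting is a
  // task team created for proxy tasks, which is why tt_found_proxy_tasks is
  // asserted TRUE before __kmp_task_team_wait() is called.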
1556  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1557  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1558  __kmp_tid_from_gtid(gtid), status));
1559 
1560 #if OMPT_SUPPORT
1561  if (ompt_enabled.enabled) {
1562 #if OMPT_OPTIONAL
1563  if (ompt_enabled.ompt_callback_sync_region_wait) {
1564  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1565  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1566  return_address);
1567  }
1568  if (ompt_enabled.ompt_callback_sync_region) {
1569  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1570  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1571  return_address);
1572  }
1573 #endif
1574  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1575  }
1576 #endif
1577  ANNOTATE_BARRIER_END(&team->t.t_bar);
1578 
1579  if (cancellable)
1580  return (int)cancelled;
1581  return status;
1582 }
1583 
1584 // Returns 0 if master thread, 1 if worker thread.
1585 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1586  size_t reduce_size, void *reduce_data,
1587  void (*reduce)(void *, void *)) {
1588  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1589  reduce);
1590 }
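// Editorial note: a minimal sketch (not part of this file) of how a
// compiler-generated call site is assumed to reach __kmp_barrier(); the
// __kmpc_barrier() entry point in kmp_csupport.cpp forwards a plain,
// non-split, non-reducing barrier request roughly as follows:
//
//   void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
//     // ... ITT/OMPT bookkeeping elided ...
//     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
//   }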
1591 
1592 #if defined(KMP_GOMP_COMPAT)
1593 // Returns 1 if cancelled, 0 otherwise
1594 int __kmp_barrier_gomp_cancel(int gtid) {
1595  if (__kmp_omp_cancellation) {
1596  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1597  0, NULL, NULL);
1598  if (cancelled) {
1599  int tid = __kmp_tid_from_gtid(gtid);
1600  kmp_info_t *this_thr = __kmp_threads[gtid];
1601  if (KMP_MASTER_TID(tid)) {
1602  // Master does not need to revert anything
1603  } else {
1604  // Workers need to revert their private b_arrived flag
1605  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1606  KMP_BARRIER_STATE_BUMP;
1607  }
1608  }
1609  return cancelled;
1610  }
1611  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1612  return FALSE;
1613 }
1614 #endif
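// Editorial note: in the GOMP compatibility layer (kmp_gsupport.cpp) the
// cancellable barrier entry point is assumed to forward to
// __kmp_barrier_gomp_cancel() and to hand the cancellation status back to the
// libgomp caller, approximately:
//
//   // returns nonzero if the binding parallel region was cancelled
//   int GOMP_barrier_cancel(void) {
//     return __kmp_barrier_gomp_cancel(__kmp_entry_gtid());
//   }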
1615 
1616 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1617  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1618  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1619  int tid = __kmp_tid_from_gtid(gtid);
1620  kmp_info_t *this_thr = __kmp_threads[gtid];
1621  kmp_team_t *team = this_thr->th.th_team;
1622 
1623  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1624  if (!team->t.t_serialized) {
1625  if (KMP_MASTER_GTID(gtid)) {
1626  switch (__kmp_barrier_release_pattern[bt]) {
1627  case bp_hyper_bar: {
1628  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1629  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1630  FALSE USE_ITT_BUILD_ARG(NULL));
1631  break;
1632  }
1633  case bp_hierarchical_bar: {
1634  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1635  FALSE USE_ITT_BUILD_ARG(NULL));
1636  break;
1637  }
1638  case bp_tree_bar: {
1639  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1640  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1641  FALSE USE_ITT_BUILD_ARG(NULL));
1642  break;
1643  }
1644  default: {
1645  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1646  FALSE USE_ITT_BUILD_ARG(NULL));
1647  }
1648  }
1649  if (__kmp_tasking_mode != tskm_immediate_exec) {
1650  __kmp_task_team_sync(this_thr, team);
1651  } // if
1652  }
1653  }
1654  ANNOTATE_BARRIER_END(&team->t.t_bar);
1655 }
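// Editorial note: __kmp_end_split_barrier() is the release half of a split
// (reduction) barrier. The gather half runs inside __kmp_barrier() with
// is_split == TRUE: the master returns with status 0 and skips the release,
// while the workers (status 1) proceed straight into the release wait. Once
// the reduction result has been combined, this routine lets the master
// release the team. A hedged sketch of the expected pairing:
//
//   // every thread: gather; the master returns early, workers block in release
//   __kmp_barrier(bs_reduction_barrier, gtid, TRUE, size, data, reduce_fn);
//   // master only: finish combining the result, then release the workers
//   __kmp_end_split_barrier(bs_reduction_barrier, gtid);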
1656 
1657 void __kmp_join_barrier(int gtid) {
1658  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1659  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1660  kmp_info_t *this_thr = __kmp_threads[gtid];
1661  kmp_team_t *team;
1662  kmp_uint nproc;
1663  kmp_info_t *master_thread;
1664  int tid;
1665 #ifdef KMP_DEBUG
1666  int team_id;
1667 #endif /* KMP_DEBUG */
1668 #if USE_ITT_BUILD
1669  void *itt_sync_obj = NULL;
1670 #if USE_ITT_NOTIFY
1671  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless needed
1672  // Get object created at fork_barrier
1673  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1674 #endif
1675 #endif /* USE_ITT_BUILD */
1676  KMP_MB();
1677 
1678  // Get current info
1679  team = this_thr->th.th_team;
1680  nproc = this_thr->th.th_team_nproc;
1681  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1682  tid = __kmp_tid_from_gtid(gtid);
1683 #ifdef KMP_DEBUG
1684  team_id = team->t.t_id;
1685 #endif /* KMP_DEBUG */
1686  master_thread = this_thr->th.th_team_master;
1687 #ifdef KMP_DEBUG
1688  if (master_thread != team->t.t_threads[0]) {
1689  __kmp_print_structure();
1690  }
1691 #endif /* KMP_DEBUG */
1692  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1693  KMP_MB();
1694 
1695  // Verify state
1696  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1697  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1698  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1699  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1700  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1701  gtid, team_id, tid));
1702 
1703  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1704 #if OMPT_SUPPORT
1705  if (ompt_enabled.enabled) {
1706 #if OMPT_OPTIONAL
1707  ompt_data_t *my_task_data;
1708  ompt_data_t *my_parallel_data;
1709  void *codeptr = NULL;
1710  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1711  if (KMP_MASTER_TID(ds_tid) &&
1712  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1713  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1714  codeptr = team->t.ompt_team_info.master_return_address;
1715  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1716  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1717  if (ompt_enabled.ompt_callback_sync_region) {
1718  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1719  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1720  my_task_data, codeptr);
1721  }
1722  if (ompt_enabled.ompt_callback_sync_region_wait) {
1723  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1724  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1725  my_task_data, codeptr);
1726  }
1727  if (!KMP_MASTER_TID(ds_tid))
1728  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1729 #endif
1730  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1731  }
1732 #endif
1733 
1734  if (__kmp_tasking_mode == tskm_extra_barrier) {
1735  __kmp_tasking_barrier(team, this_thr, gtid);
1736  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1737  team_id, tid));
1738  }
1739 #ifdef KMP_DEBUG
1740  if (__kmp_tasking_mode != tskm_immediate_exec) {
1741  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1742  "%p, th_task_team = %p\n",
1743  __kmp_gtid_from_thread(this_thr), team_id,
1744  team->t.t_task_team[this_thr->th.th_task_state],
1745  this_thr->th.th_task_team));
1746  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1747  team->t.t_task_team[this_thr->th.th_task_state]);
1748  }
1749 #endif /* KMP_DEBUG */
1750 
1751  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1752  access it when the team struct is not guaranteed to exist. Doing these
1753  loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
1754  we do not perform the copy if blocktime=infinite, since the values are not
1755  used by __kmp_wait_template() in that case. */
1756  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1757 #if KMP_USE_MONITOR
1758  this_thr->th.th_team_bt_intervals =
1759  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1760  this_thr->th.th_team_bt_set =
1761  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1762 #else
1763  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1764 #endif
1765  }
1766 
1767 #if USE_ITT_BUILD
1768  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1769  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1770 #endif /* USE_ITT_BUILD */
1771 
1772  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1773  case bp_hyper_bar: {
1774  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1775  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1776  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1777  break;
1778  }
1779  case bp_hierarchical_bar: {
1780  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1781  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1782  break;
1783  }
1784  case bp_tree_bar: {
1785  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1786  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1787  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1788  break;
1789  }
1790  default: {
1791  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1792  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1793  }
1794  }
1795 
1796  /* From this point on, the team data structure may be deallocated at any time
1797  by the master thread - it is unsafe to reference it in any of the worker
1798  threads. Any per-team data items that need to be referenced before the
1799  end of the barrier should be moved to the kmp_task_team_t structs. */
1800  if (KMP_MASTER_TID(tid)) {
1801  if (__kmp_tasking_mode != tskm_immediate_exec) {
1802  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1803  }
1804  if (__kmp_display_affinity) {
1805  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1806  }
1807 #if KMP_STATS_ENABLED
1808  // Have the master thread flag the workers to indicate they are now waiting
1809  // for the next parallel region. Also wake them up so they switch their
1810  // timers to idle.
1811  for (int i = 0; i < team->t.t_nproc; ++i) {
1812  kmp_info_t *team_thread = team->t.t_threads[i];
1813  if (team_thread == this_thr)
1814  continue;
1815  team_thread->th.th_stats->setIdleFlag();
1816  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1817  team_thread->th.th_sleep_loc != NULL)
1818  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1819  team_thread->th.th_sleep_loc);
1820  }
1821 #endif
1822 #if USE_ITT_BUILD
1823  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1824  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1825 #endif /* USE_ITT_BUILD */
1826 
1827 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1828  // Join barrier - report frame end
1829  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1830  __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1831  team->t.t_active_level == 1) {
1832  kmp_uint64 cur_time = __itt_get_timestamp();
1833  ident_t *loc = team->t.t_ident;
1834  kmp_info_t **other_threads = team->t.t_threads;
1835  int nproc = this_thr->th.th_team_nproc;
1836  int i;
1837  switch (__kmp_forkjoin_frames_mode) {
1838  case 1:
1839  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1840  loc, nproc);
1841  break;
1842  case 2:
1843  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1844  loc, nproc);
1845  break;
1846  case 3:
1847  if (__itt_metadata_add_ptr) {
1848  // Initialize with master's wait time
1849  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1850  // Set arrive time to zero to be able to check it in
1851  // __kmp_invoke_task(); the same is done inside the loop below
1852  this_thr->th.th_bar_arrive_time = 0;
1853  for (i = 1; i < nproc; ++i) {
1854  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1855  other_threads[i]->th.th_bar_arrive_time = 0;
1856  }
1857  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1858  cur_time, delta, 0);
1859  }
1860  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1861  loc, nproc);
1862  this_thr->th.th_frame_time = cur_time;
1863  break;
1864  }
1865  }
1866 #endif /* USE_ITT_BUILD */
1867  }
1868 #if USE_ITT_BUILD
1869  else {
1870  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1871  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1872  }
1873 #endif /* USE_ITT_BUILD */
1874 
1875 #if KMP_DEBUG
1876  if (KMP_MASTER_TID(tid)) {
1877  KA_TRACE(
1878  15,
1879  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1880  gtid, team_id, tid, nproc));
1881  }
1882 #endif /* KMP_DEBUG */
1883 
1884  // TODO: now mark worker threads as done so they may be disbanded
1885  KMP_MB(); // Flush all pending memory write invalidates.
1886  KA_TRACE(10,
1887  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1888 
1889  ANNOTATE_BARRIER_END(&team->t.t_bar);
1890 }
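// Editorial note (hedged): after the join barrier the worker threads are
// assumed to park in their fork/join release wait, while the master returns
// to serial execution; the next __kmp_fork_call() then releases them through
// __kmp_fork_barrier() below, so the join and fork barriers act as a matched
// pair on the shared bs_forkjoin_barrier state.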
1891 
1892 // TODO release worker threads' fork barriers as we are ready instead of all at
1893 // once
1894 void __kmp_fork_barrier(int gtid, int tid) {
1895  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1896  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1897  kmp_info_t *this_thr = __kmp_threads[gtid];
1898  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1899 #if USE_ITT_BUILD
1900  void *itt_sync_obj = NULL;
1901 #endif /* USE_ITT_BUILD */
1902  if (team)
1903  ANNOTATE_BARRIER_END(&team->t.t_bar);
1904 
1905  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1906  (team != NULL) ? team->t.t_id : -1, tid));
1907 
1908  // The th_team pointer is only valid for the master thread here
1909  if (KMP_MASTER_TID(tid)) {
1910 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1911  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1912  // Create itt barrier object
1913  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1914  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1915  }
1916 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1917 
1918 #ifdef KMP_DEBUG
1919  kmp_info_t **other_threads = team->t.t_threads;
1920  int i;
1921 
1922  // Verify state
1923  KMP_MB();
1924 
1925  for (i = 1; i < team->t.t_nproc; ++i) {
1926  KA_TRACE(500,
1927  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1928  "== %u.\n",
1929  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1930  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1931  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1932  KMP_DEBUG_ASSERT(
1933  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1934  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1935  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1936  }
1937 #endif
1938 
1939  if (__kmp_tasking_mode != tskm_immediate_exec) {
1940  // 0 indicates: only set up the current task team if nthreads > 1
1941  __kmp_task_team_setup(this_thr, team, 0);
1942  }
1943 
1944  /* The master thread may have changed its blocktime between the join barrier
1945  and the fork barrier. Copy the blocktime info to the thread, where
1946  __kmp_wait_template() can access it when the team struct is not
1947  guaranteed to exist. */
1948  // See note about the corresponding code in __kmp_join_barrier() being
1949  // performance-critical
1950  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1951 #if KMP_USE_MONITOR
1952  this_thr->th.th_team_bt_intervals =
1953  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1954  this_thr->th.th_team_bt_set =
1955  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1956 #else
1957  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1958 #endif
1959  }
1960  } // master
1961 
1962  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1963  case bp_hyper_bar: {
1964  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1965  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1966  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1967  break;
1968  }
1969  case bp_hierarchical_bar: {
1970  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1971  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1972  break;
1973  }
1974  case bp_tree_bar: {
1975  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1976  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1977  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1978  break;
1979  }
1980  default: {
1981  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1982  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1983  }
1984  }
1985 
1986 #if OMPT_SUPPORT
1987  if (ompt_enabled.enabled &&
1988  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1989  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1990  ompt_data_t *task_data = (team)
1991  ? OMPT_CUR_TASK_DATA(this_thr)
1992  : &(this_thr->th.ompt_thread_info.task_data);
1993  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1994 #if OMPT_OPTIONAL
1995  void *codeptr = NULL;
1996  if (KMP_MASTER_TID(ds_tid) &&
1997  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1998  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1999  codeptr = team->t.ompt_team_info.master_return_address;
2000  if (ompt_enabled.ompt_callback_sync_region_wait) {
2001  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2002  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2003  codeptr);
2004  }
2005  if (ompt_enabled.ompt_callback_sync_region) {
2006  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2007  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2008  codeptr);
2009  }
2010 #endif
2011  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2012  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2013  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2014  }
2015  }
2016 #endif
2017 
2018  // Early exit for reaping threads releasing forkjoin barrier
2019  if (TCR_4(__kmp_global.g.g_done)) {
2020  this_thr->th.th_task_team = NULL;
2021 
2022 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2023  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2024  if (!KMP_MASTER_TID(tid)) {
2025  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2026  if (itt_sync_obj)
2027  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2028  }
2029  }
2030 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2031  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2032  return;
2033  }
2034 
2035  /* We can now assume that a valid team structure has been allocated by the
2036  master and propagated to all worker threads. The current thread, however,
2037  may not be part of the team, so we can't blindly assume that the team
2038  pointer is non-null. */
2039  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2040  KMP_DEBUG_ASSERT(team != NULL);
2041  tid = __kmp_tid_from_gtid(gtid);
2042 
2043 #if KMP_BARRIER_ICV_PULL
2044  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2045  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2046  implicit task has this data before this function is called. We cannot
2047  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2048  struct, because it is not always the case that the threads arrays have
2049  been allocated when __kmp_fork_call() is executed. */
2050  {
2051  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2052  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2053  // Copy the initial ICVs from the master's thread struct to the implicit
2054  // task for this tid.
2055  KA_TRACE(10,
2056  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2057  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2058  tid, FALSE);
2059  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2060  &team->t.t_threads[0]
2061  ->th.th_bar[bs_forkjoin_barrier]
2062  .bb.th_fixed_icvs);
2063  }
2064  }
2065 #endif // KMP_BARRIER_ICV_PULL
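  // Editorial note: KMP_BARRIER_ICV_PULL is one of three compile-time ICV
  // propagation strategies (see __kmp_setup_icv_copy() below): with PULL the
  // workers copy the ICVs out of the master's th_fixed_icvs after being
  // released, with KMP_BARRIER_ICV_PUSH the values travel with the release
  // itself, and otherwise the master copies them linearly into each worker's
  // implicit task before the fork barrier.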
2066 
2067  if (__kmp_tasking_mode != tskm_immediate_exec) {
2068  __kmp_task_team_sync(this_thr, team);
2069  }
2070 
2071 #if KMP_AFFINITY_SUPPORTED
2072  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2073  if (proc_bind == proc_bind_intel) {
2074  // Call dynamic affinity settings
2075  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2076  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2077  }
2078  } else if (proc_bind != proc_bind_false) {
2079  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2080  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2081  __kmp_gtid_from_thread(this_thr),
2082  this_thr->th.th_current_place));
2083  } else {
2084  __kmp_affinity_set_place(gtid);
2085  }
2086  }
2087 #endif // KMP_AFFINITY_SUPPORTED
2088  // Perform the display affinity functionality
2089  if (__kmp_display_affinity) {
2090  if (team->t.t_display_affinity
2091 #if KMP_AFFINITY_SUPPORTED
2092  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2093 #endif
2094  ) {
2095  // NULL means use the affinity-format-var ICV
2096  __kmp_aux_display_affinity(gtid, NULL);
2097  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2098  this_thr->th.th_prev_level = team->t.t_level;
2099  }
2100  }
2101  if (!KMP_MASTER_TID(tid))
2102  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2103 
2104 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2105  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2106  if (!KMP_MASTER_TID(tid)) {
2107  // Get correct barrier object
2108  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2109  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2110  } // (prepare called inside barrier_release)
2111  }
2112 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2113  ANNOTATE_BARRIER_END(&team->t.t_bar);
2114  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2115  team->t.t_id, tid));
2116 }
2117 
2118 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2119  kmp_internal_control_t *new_icvs, ident_t *loc) {
2120  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2121 
2122  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2123  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2124 
2125 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2126  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2127  implicit task has this data before this function is called. */
2128 #if KMP_BARRIER_ICV_PULL
2129 /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2130  remains untouched), where all of the worker threads can access them and make
2131  their own copies after the barrier. */
2132  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2133  // allocated at this point
2134  copy_icvs(
2135  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2136  new_icvs);
2137  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2138  team->t.t_threads[0], team));
2139 #elif KMP_BARRIER_ICV_PUSH
2140  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2141  // done here.
2142  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2143  team->t.t_threads[0], team));
2144 #else
2145  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2146  // time.
2147  ngo_load(new_icvs);
2148  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2149  // allocated at this point
2150  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2151  // TODO: GEH - pass in better source location info since usually NULL here
2152  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2153  f, team->t.t_threads[f], team));
2154  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2155  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2156  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2157  f, team->t.t_threads[f], team));
2158  }
2159  ngo_sync();
2160 #endif // KMP_BARRIER_ICV_PULL
2161 }
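// Editorial note (hedged): this helper is expected to be invoked from the
// team-setup path in kmp_runtime.cpp when a team is (re)initialized with a
// new set of internal control variables, before its workers pass through the
// fork barrier; the exact call sites are not shown in this file.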