1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  * it may change values between parallel regions. __kmp_max_nth
17  * is the largest value __kmp_nth may take; 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44 
45  KMP_DEBUG_ASSERT(gtid_ref);
46 
47  if (__kmp_env_consistency_check) {
48  th = __kmp_threads[*gtid_ref];
49  if (th->th.th_root->r.r_active &&
50  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56  }
57  }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62 
63  if (__kmp_env_consistency_check) {
64  th = __kmp_threads[*gtid_ref];
65  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67  }
68  }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73  bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  int monotonicity;
76  // default to monotonic
77  monotonicity = SCHEDULE_MONOTONIC;
78  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79  monotonicity = SCHEDULE_NONMONOTONIC;
80  else if (SCHEDULE_HAS_MONOTONIC(schedule))
81  monotonicity = SCHEDULE_MONOTONIC;
82  return monotonicity;
83 }
84 
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule and chunk. The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride). nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used). tid is the id of the thread calling
90 // the function within the group of nproc threads. It will have a value
91 // between 0 and nproc - 1. This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97  dispatch_private_info_template<T> *pr,
98  enum sched_type schedule, T lb, T ub,
99  typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101  kmp_uint64 *cur_chunk,
102 #endif
103  typename traits_t<T>::signed_t chunk,
104  T nproc, T tid) {
105  typedef typename traits_t<T>::unsigned_t UT;
106  typedef typename traits_t<T>::floating_t DBL;
107 
108  int active;
109  T tc;
110  kmp_info_t *th;
111  kmp_team_t *team;
112  int monotonicity;
113  bool use_hier;
114 
115 #ifdef KMP_DEBUG
116  typedef typename traits_t<T>::signed_t ST;
117  {
118  char *buff;
119  // create format specifiers before the debug output
120  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123  traits_t<T>::spec, traits_t<T>::spec,
124  traits_t<ST>::spec, traits_t<ST>::spec,
125  traits_t<T>::spec, traits_t<T>::spec);
126  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127  __kmp_str_free(&buff);
128  }
129 #endif
130  /* setup data */
131  th = __kmp_threads[gtid];
132  team = th->th.th_team;
133  active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136  int itt_need_metadata_reporting =
137  __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139  team->t.t_active_level == 1;
140 #endif
141 
142 #if KMP_USE_HIER_SCHED
143  use_hier = pr->flags.use_hier;
144 #else
145  use_hier = false;
146 #endif
147 
148  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151 
152  /* Pick up the nomerge/ordered bits from the scheduling type */
153  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154  pr->flags.nomerge = TRUE;
155  schedule =
156  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157  } else {
158  pr->flags.nomerge = FALSE;
159  }
160  pr->type_size = traits_t<T>::type_size; // remember the size of variables
161  if (kmp_ord_lower & schedule) {
162  pr->flags.ordered = TRUE;
163  schedule =
164  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165  } else {
166  pr->flags.ordered = FALSE;
167  }
168  // Ordered overrides nonmonotonic
169  if (pr->flags.ordered) {
170  monotonicity = SCHEDULE_MONOTONIC;
171  }
172 
173  if (schedule == kmp_sch_static) {
174  schedule = __kmp_static;
175  } else {
176  if (schedule == kmp_sch_runtime) {
177  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178  // not specified)
179  schedule = team->t.t_sched.r_sched_type;
180  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182  // Detail the schedule if needed (global controls are differentiated
183  // appropriately)
184  if (schedule == kmp_sch_guided_chunked) {
185  schedule = __kmp_guided;
186  } else if (schedule == kmp_sch_static) {
187  schedule = __kmp_static;
188  }
189  // Use the chunk size specified by OMP_SCHEDULE (or default if not
190  // specified)
191  chunk = team->t.t_sched.chunk;
192 #if USE_ITT_BUILD
193  if (cur_chunk)
194  *cur_chunk = chunk;
195 #endif
196 #ifdef KMP_DEBUG
197  {
198  char *buff;
199  // create format specifiers before the debug output
200  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201  "schedule:%%d chunk:%%%s\n",
202  traits_t<ST>::spec);
203  KD_TRACE(10, (buff, gtid, schedule, chunk));
204  __kmp_str_free(&buff);
205  }
206 #endif
207  } else {
208  if (schedule == kmp_sch_guided_chunked) {
209  schedule = __kmp_guided;
210  }
211  if (chunk <= 0) {
212  chunk = KMP_DEFAULT_CHUNK;
213  }
214  }
215 
216  if (schedule == kmp_sch_auto) {
217  // mapping and differentiation are done in __kmp_do_serial_initialize()
218  schedule = __kmp_auto;
219 #ifdef KMP_DEBUG
220  {
221  char *buff;
222  // create format specifiers before the debug output
223  buff = __kmp_str_format(
224  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225  "schedule:%%d chunk:%%%s\n",
226  traits_t<ST>::spec);
227  KD_TRACE(10, (buff, gtid, schedule, chunk));
228  __kmp_str_free(&buff);
229  }
230 #endif
231  }
232 #if KMP_STATIC_STEAL_ENABLED
233  // map nonmonotonic:dynamic to static steal
234  if (schedule == kmp_sch_dynamic_chunked) {
235  if (monotonicity == SCHEDULE_NONMONOTONIC)
236  schedule = kmp_sch_static_steal;
237  }
238 #endif
239  /* guided analytical not safe for too many threads */
240  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241  schedule = kmp_sch_guided_iterative_chunked;
242  KMP_WARNING(DispatchManyThreads);
243  }
244  if (schedule == kmp_sch_runtime_simd) {
245  // compiler provides simd_width in the chunk parameter
246  schedule = team->t.t_sched.r_sched_type;
247  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249  // Detail the schedule if needed (global controls are differentiated
250  // appropriately)
251  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252  schedule == __kmp_static) {
253  schedule = kmp_sch_static_balanced_chunked;
254  } else {
255  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256  schedule = kmp_sch_guided_simd;
257  }
258  chunk = team->t.t_sched.chunk * chunk;
259  }
260 #if USE_ITT_BUILD
261  if (cur_chunk)
262  *cur_chunk = chunk;
263 #endif
264 #ifdef KMP_DEBUG
265  {
266  char *buff;
267  // create format specifiers before the debug output
268  buff = __kmp_str_format(
269  "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270  " chunk:%%%s\n",
271  traits_t<ST>::spec);
272  KD_TRACE(10, (buff, gtid, schedule, chunk));
273  __kmp_str_free(&buff);
274  }
275 #endif
276  }
277  pr->u.p.parm1 = chunk;
278  }
279  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280  "unknown scheduling type");
281 
282  pr->u.p.count = 0;
283 
284  if (__kmp_env_consistency_check) {
285  if (st == 0) {
286  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288  }
289  }
290  // compute trip count
291  if (st == 1) { // most common case
292  if (ub >= lb) {
293  tc = ub - lb + 1;
294  } else { // ub < lb
295  tc = 0; // zero-trip
296  }
297  } else if (st < 0) {
298  if (lb >= ub) {
299  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300  // where the division needs to be unsigned regardless of the result type
301  tc = (UT)(lb - ub) / (-st) + 1;
302  } else { // lb < ub
303  tc = 0; // zero-trip
304  }
305  } else { // st > 0
306  if (ub >= lb) {
307  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308  // where the division needs to be unsigned regardless of the result type
309  tc = (UT)(ub - lb) / st + 1;
310  } else { // ub < lb
311  tc = 0; // zero-trip
312  }
313  }
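  // Worked examples of the trip count formulas above:
  //   lb=0, ub=9, st=2   =>  tc = (9 - 0) / 2 + 1 = 5  (iterations 0,2,4,6,8)
  //   lb=9, ub=0, st=-3  =>  tc = (9 - 0) / 3 + 1 = 4  (iterations 9,6,3,0)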
314 
315 #if KMP_STATS_ENABLED
316  if (KMP_MASTER_GTID(gtid)) {
317  KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318  }
319 #endif
320 
321  pr->u.p.lb = lb;
322  pr->u.p.ub = ub;
323  pr->u.p.st = st;
324  pr->u.p.tc = tc;
325 
326 #if KMP_OS_WINDOWS
327  pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
329 
330  /* NOTE: only the active parallel region(s) have active ordered sections */
331 
332  if (active) {
333  if (pr->flags.ordered) {
334  pr->ordered_bumped = 0;
335  pr->u.p.ordered_lower = 1;
336  pr->u.p.ordered_upper = 0;
337  }
338  }
339 
340  switch (schedule) {
341 #if (KMP_STATIC_STEAL_ENABLED)
342  case kmp_sch_static_steal: {
343  T ntc, init;
344 
345  KD_TRACE(100,
346  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347  gtid));
348 
349  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350  if (nproc > 1 && ntc >= nproc) {
351  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352  T id = tid;
353  T small_chunk, extras;
354 
355  small_chunk = ntc / nproc;
356  extras = ntc % nproc;
357 
358  init = id * small_chunk + (id < extras ? id : extras);
359  pr->u.p.count = init;
360  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
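  // Example: tc=100, chunk=7 gives ntc=15 chunks; with nproc=4: small_chunk=3,
  // extras=3, so threads 0..2 own 4 chunks each and thread 3 owns 3
  // (count/ub pairs 0/4, 4/8, 8/12, 12/15).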
361 
362  pr->u.p.parm2 = lb;
363  // parm3 is the number of times to attempt stealing, which is
364  // proportional to the number of chunks per thread, up to
365  // a maximum of nproc.
366  pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368  pr->u.p.st = st;
369  if (traits_t<T>::type_size > 4) {
370  // AC: TODO: check if 16-byte CAS available and use it to
371  // improve performance (probably wait for explicit request
372  // before spending time on this).
373  // For now use dynamically allocated per-thread lock,
374  // free memory in __kmp_dispatch_next when status==0.
375  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
376  th->th.th_dispatch->th_steal_lock =
377  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
379  }
380  break;
381  } else {
382  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
383  "kmp_sch_static_balanced\n",
384  gtid));
385  schedule = kmp_sch_static_balanced;
386  /* too few iterations: fall-through to kmp_sch_static_balanced */
387  } // if
388  /* FALL-THROUGH to static balanced */
389  KMP_FALLTHROUGH();
390  } // case
391 #endif
392  case kmp_sch_static_balanced: {
393  T init, limit;
394 
395  KD_TRACE(
396  100,
397  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
398  gtid));
399 
400  if (nproc > 1) {
401  T id = tid;
402 
403  if (tc < nproc) {
404  if (id < tc) {
405  init = id;
406  limit = id;
407  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
408  } else {
409  pr->u.p.count = 1; /* means no more chunks to execute */
410  pr->u.p.parm1 = FALSE;
411  break;
412  }
413  } else {
414  T small_chunk = tc / nproc;
415  T extras = tc % nproc;
416  init = id * small_chunk + (id < extras ? id : extras);
417  limit = init + small_chunk - (id < extras ? 0 : 1);
418  pr->u.p.parm1 = (id == nproc - 1);
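  // Example: tc=10, nproc=4 gives small_chunk=2, extras=2; the threads get
  // init..limit offsets [0,2], [3,5], [6,7], [8,9] from lb (the first
  // 'extras' threads receive one extra iteration).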
419  }
420  } else {
421  if (tc > 0) {
422  init = 0;
423  limit = tc - 1;
424  pr->u.p.parm1 = TRUE;
425  } else {
426  // zero trip count
427  pr->u.p.count = 1; /* means no more chunks to execute */
428  pr->u.p.parm1 = FALSE;
429  break;
430  }
431  }
432 #if USE_ITT_BUILD
433  // Calculate chunk for metadata report
434  if (itt_need_metadata_reporting)
435  if (cur_chunk)
436  *cur_chunk = limit - init + 1;
437 #endif
438  if (st == 1) {
439  pr->u.p.lb = lb + init;
440  pr->u.p.ub = lb + limit;
441  } else {
442  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
443  T ub_tmp = lb + limit * st;
444  pr->u.p.lb = lb + init * st;
445  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
446  // it exactly
447  if (st > 0) {
448  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
449  } else {
450  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
451  }
452  }
453  if (pr->flags.ordered) {
454  pr->u.p.ordered_lower = init;
455  pr->u.p.ordered_upper = limit;
456  }
457  break;
458  } // case
459  case kmp_sch_static_balanced_chunked: {
460  // similar to balanced, but chunk adjusted to multiple of simd width
461  T nth = nproc;
462  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
463  " -> falling-through to static_greedy\n",
464  gtid));
465  schedule = kmp_sch_static_greedy;
466  if (nth > 1)
467  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
468  else
469  pr->u.p.parm1 = tc;
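  // When nth > 1, the per-thread share is rounded up to a multiple of chunk
  // (the simd width); the mask trick assumes chunk is a power of two.
  // Example: tc=100, nth=4, chunk=8 gives ceil(100/4)=25, rounded up to 32.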
470  break;
471  } // case
472  case kmp_sch_guided_simd:
473  case kmp_sch_guided_iterative_chunked: {
474  KD_TRACE(
475  100,
476  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
477  " case\n",
478  gtid));
479 
480  if (nproc > 1) {
481  if ((2L * chunk + 1) * nproc >= tc) {
482  /* chunk size too large, switch to dynamic */
483  schedule = kmp_sch_dynamic_chunked;
484  } else {
485  // when remaining iters become less than parm2 - switch to dynamic
486  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
487  *(double *)&pr->u.p.parm3 =
488  guided_flt_param / nproc; // may occupy parm3 and parm4
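  // parm2 is the "remaining iterations" threshold below which the loop falls
  // back to plain dynamic chunks; parm3 holds the fraction of the remaining
  // iterations handed out per chunk (about 1/(2*nproc) with the default
  // guided_* parameters), stored as a double across parm3/parm4.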
489  }
490  } else {
491  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
492  "kmp_sch_static_greedy\n",
493  gtid));
494  schedule = kmp_sch_static_greedy;
495  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
496  KD_TRACE(
497  100,
498  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
499  gtid));
500  pr->u.p.parm1 = tc;
501  } // if
502  } // case
503  break;
504  case kmp_sch_guided_analytical_chunked: {
505  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
506  "kmp_sch_guided_analytical_chunked case\n",
507  gtid));
508 
509  if (nproc > 1) {
510  if ((2L * chunk + 1) * nproc >= tc) {
511  /* chunk size too large, switch to dynamic */
512  schedule = kmp_sch_dynamic_chunked;
513  } else {
514  /* commonly used term: (2 nproc - 1)/(2 nproc) */
515  DBL x;
516 
517 #if KMP_USE_X87CONTROL
518  /* Linux* OS already has 64-bit computation by default for long double,
519  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
520  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
521  instead of the default 53-bit. Even though long double doesn't work
522  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
523  expected to impact the correctness of the algorithm, but this has not
524  been mathematically proven. */
525  // save original FPCW and set precision to 64-bit, as
526  // Windows* OS on IA-32 architecture defaults to 53-bit
527  unsigned int oldFpcw = _control87(0, 0);
528  _control87(_PC_64, _MCW_PC); // 0,0x30000
529 #endif
530  /* value used for comparison in solver for cross-over point */
531  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
532 
533  /* crossover point--chunk indexes equal to or greater than
534  this point switch to dynamic-style scheduling */
535  UT cross;
536 
537  /* commonly used term: (2 nproc - 1)/(2 nproc) */
538  x = (long double)1.0 - (long double)0.5 / nproc;
539 
540 #ifdef KMP_DEBUG
541  { // test natural alignment
542  struct _test_a {
543  char a;
544  union {
545  char b;
546  DBL d;
547  };
548  } t;
549  ptrdiff_t natural_alignment =
550  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
551  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
552  // long)natural_alignment );
553  KMP_DEBUG_ASSERT(
554  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
555  }
556 #endif // KMP_DEBUG
557 
558  /* save the term in thread private dispatch structure */
559  *(DBL *)&pr->u.p.parm3 = x;
560 
561  /* solve for the crossover point to the nearest integer i for which C_i
562  <= chunk */
563  {
564  UT left, right, mid;
565  long double p;
566 
567  /* estimate initial upper and lower bound */
568 
569  /* doesn't matter what value right is as long as it is positive, but
570  it affects performance of the solver */
571  right = 229;
572  p = __kmp_pow<UT>(x, right);
573  if (p > target) {
574  do {
575  p *= p;
576  right <<= 1;
577  } while (p > target && right < (1 << 27));
578  /* lower bound is previous (failed) estimate of upper bound */
579  left = right >> 1;
580  } else {
581  left = 0;
582  }
583 
584  /* bisection root-finding method */
585  while (left + 1 < right) {
586  mid = (left + right) / 2;
587  if (__kmp_pow<UT>(x, mid) > target) {
588  left = mid;
589  } else {
590  right = mid;
591  }
592  } // while
593  cross = right;
594  }
595  /* assert sanity of computed crossover point */
596  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
597  __kmp_pow<UT>(x, cross) <= target);
598 
599  /* save the crossover point in thread private dispatch structure */
600  pr->u.p.parm2 = cross;
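  // cross is the smallest chunk index i with x^i <= target, i.e. the point at
  // which the exponentially shrinking guided chunks (remaining ~ tc * x^i)
  // fall to roughly the requested chunk size; from index cross onward the
  // loop hands out fixed chunks of size 'chunk' (dynamic-style).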
601 
602 // C75803
603 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
604 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
605 #else
606 #define GUIDED_ANALYTICAL_WORKAROUND (x)
607 #endif
608  /* dynamic-style scheduling offset */
609  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
610  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
611  cross * chunk;
612 #if KMP_USE_X87CONTROL
613  // restore FPCW
614  _control87(oldFpcw, _MCW_PC);
615 #endif
616  } // if
617  } else {
618  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
619  "kmp_sch_static_greedy\n",
620  gtid));
621  schedule = kmp_sch_static_greedy;
622  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
623  pr->u.p.parm1 = tc;
624  } // if
625  } // case
626  break;
627  case kmp_sch_static_greedy:
628  KD_TRACE(
629  100,
630  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
631  gtid));
632  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
633  break;
634  case kmp_sch_static_chunked:
635  case kmp_sch_dynamic_chunked:
636  if (pr->u.p.parm1 <= 0) {
637  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
638  }
639  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
640  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
641  gtid));
642  break;
643  case kmp_sch_trapezoidal: {
644  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
645 
646  T parm1, parm2, parm3, parm4;
647  KD_TRACE(100,
648  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
649  gtid));
650 
651  parm1 = chunk;
652 
653  /* F : size of the first cycle */
654  parm2 = (tc / (2 * nproc));
655 
656  if (parm2 < 1) {
657  parm2 = 1;
658  }
659 
660  /* L : size of the last cycle. Make sure the last cycle is not larger
661  than the first cycle. */
662  if (parm1 < 1) {
663  parm1 = 1;
664  } else if (parm1 > parm2) {
665  parm1 = parm2;
666  }
667 
668  /* N : number of cycles */
669  parm3 = (parm2 + parm1);
670  parm3 = (2 * tc + parm3 - 1) / parm3;
671 
672  if (parm3 < 2) {
673  parm3 = 2;
674  }
675 
676  /* sigma : decreasing incr of the trapezoid */
677  parm4 = (parm3 - 1);
678  parm4 = (parm2 - parm1) / parm4;
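  // Example: tc=1000, nproc=4, chunk=1 gives parm2 (first chunk) = 125,
  // parm1 (last chunk) = 1, parm3 (number of chunks) = 16 and
  // parm4 (decrement) = (125 - 1) / 15 = 8, i.e. chunk sizes 125, 117, 109, ...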
679 
680  // pointless check, because parm4 >= 0 always
681  // if ( parm4 < 0 ) {
682  // parm4 = 0;
683  //}
684 
685  pr->u.p.parm1 = parm1;
686  pr->u.p.parm2 = parm2;
687  pr->u.p.parm3 = parm3;
688  pr->u.p.parm4 = parm4;
689  } // case
690  break;
691 
692  default: {
693  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
694  KMP_HNT(GetNewerLibrary), // Hint
695  __kmp_msg_null // Variadic argument list terminator
696  );
697  } break;
698  } // switch
699  pr->schedule = schedule;
700 }
701 
702 #if KMP_USE_HIER_SCHED
703 template <typename T>
704 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
705  typename traits_t<T>::signed_t st);
706 template <>
707 inline void
708 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
709  kmp_int32 ub, kmp_int32 st) {
710  __kmp_dispatch_init_hierarchy<kmp_int32>(
711  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
712  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
713 }
714 template <>
715 inline void
716 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
717  kmp_uint32 ub, kmp_int32 st) {
718  __kmp_dispatch_init_hierarchy<kmp_uint32>(
719  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
720  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
721 }
722 template <>
723 inline void
724 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
725  kmp_int64 ub, kmp_int64 st) {
726  __kmp_dispatch_init_hierarchy<kmp_int64>(
727  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
728  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
729 }
730 template <>
731 inline void
732 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
733  kmp_uint64 ub, kmp_int64 st) {
734  __kmp_dispatch_init_hierarchy<kmp_uint64>(
735  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
736  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
737 }
738 
739 // free all the hierarchy scheduling memory associated with the team
740 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
741  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
742  for (int i = 0; i < num_disp_buff; ++i) {
743  // type does not matter here so use kmp_int32
744  auto sh =
745  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
746  &team->t.t_disp_buffer[i]);
747  if (sh->hier) {
748  sh->hier->deallocate();
749  __kmp_free(sh->hier);
750  }
751  }
752 }
753 #endif
754 
755 // UT - unsigned flavor of T, ST - signed flavor of T,
756 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
757 template <typename T>
758 static void
759 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
760  T ub, typename traits_t<T>::signed_t st,
761  typename traits_t<T>::signed_t chunk, int push_ws) {
762  typedef typename traits_t<T>::unsigned_t UT;
763 
764  int active;
765  kmp_info_t *th;
766  kmp_team_t *team;
767  kmp_uint32 my_buffer_index;
768  dispatch_private_info_template<T> *pr;
769  dispatch_shared_info_template<T> volatile *sh;
770 
771  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
772  sizeof(dispatch_private_info));
773  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
774  sizeof(dispatch_shared_info));
775 
776  if (!TCR_4(__kmp_init_parallel))
777  __kmp_parallel_initialize();
778 
779  __kmp_resume_if_soft_paused();
780 
781 #if INCLUDE_SSC_MARKS
782  SSC_MARK_DISPATCH_INIT();
783 #endif
784 #ifdef KMP_DEBUG
785  typedef typename traits_t<T>::signed_t ST;
786  {
787  char *buff;
788  // create format specifiers before the debug output
789  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
790  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
791  traits_t<ST>::spec, traits_t<T>::spec,
792  traits_t<T>::spec, traits_t<ST>::spec);
793  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
794  __kmp_str_free(&buff);
795  }
796 #endif
797  /* setup data */
798  th = __kmp_threads[gtid];
799  team = th->th.th_team;
800  active = !team->t.t_serialized;
801  th->th.th_ident = loc;
802 
803  // Any half-decent optimizer will remove this test when the blocks are empty
804  // since the macros expand to nothing
805  // when statistics are disabled.
806  if (schedule == __kmp_static) {
807  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
808  } else {
809  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
810  }
811 
812 #if KMP_USE_HIER_SCHED
813  // Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE
814  // environment variable. Hierarchical scheduling does not work with ordered,
815  // so if ordered is detected, revert to threaded scheduling.
816  bool ordered;
817  enum sched_type my_sched = schedule;
818  my_buffer_index = th->th.th_dispatch->th_disp_index;
819  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
820  &th->th.th_dispatch
821  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
822  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
823  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
824  my_sched =
825  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
826  ordered = (kmp_ord_lower & my_sched);
827  if (pr->flags.use_hier) {
828  if (ordered) {
829  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
830  "Disabling hierarchical scheduling.\n",
831  gtid));
832  pr->flags.use_hier = FALSE;
833  }
834  }
835  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
836  // Don't use hierarchical for ordered parallel loops and don't
837  // use the runtime hierarchy if one was specified in the program
838  if (!ordered && !pr->flags.use_hier)
839  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
840  }
841 #endif // KMP_USE_HIER_SCHED
842 
843 #if USE_ITT_BUILD
844  kmp_uint64 cur_chunk = chunk;
845  int itt_need_metadata_reporting =
846  __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
847  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
848  team->t.t_active_level == 1;
849 #endif
850  if (!active) {
851  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
852  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
853  } else {
854  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
855  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
856 
857  my_buffer_index = th->th.th_dispatch->th_disp_index++;
858 
859  /* What happens when number of threads changes, need to resize buffer? */
860  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
861  &th->th.th_dispatch
862  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
863  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
864  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
865  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
866  my_buffer_index));
867  }
868 
869  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
870 #if USE_ITT_BUILD
871  &cur_chunk,
872 #endif
873  chunk, (T)th->th.th_team_nproc,
874  (T)th->th.th_info.ds.ds_tid);
875  if (active) {
876  if (pr->flags.ordered == 0) {
877  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
878  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
879  } else {
880  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
881  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
882  }
883  }
884 
885  if (active) {
886  /* This buffer is free to use once sh->buffer_index reaches my_buffer_index
887  * (enforced by the wait below) */
888 
889  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
890  "sh->buffer_index:%d\n",
891  gtid, my_buffer_index, sh->buffer_index));
892  __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
893  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
894  // Note: KMP_WAIT() cannot be used there: buffer index and
895  // my_buffer_index are *always* 32-bit integers.
896  KMP_MB(); /* is this necessary? */
897  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
898  "sh->buffer_index:%d\n",
899  gtid, my_buffer_index, sh->buffer_index));
900 
901  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
902  th->th.th_dispatch->th_dispatch_sh_current =
903  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
904 #if USE_ITT_BUILD
905  if (pr->flags.ordered) {
906  __kmp_itt_ordered_init(gtid);
907  }
908  // Report loop metadata
909  if (itt_need_metadata_reporting) {
910  // Only report metadata by master of active team at level 1
911  kmp_uint64 schedtype = 0;
912  switch (schedule) {
913  case kmp_sch_static_chunked:
914  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
915  break;
916  case kmp_sch_static_greedy:
917  cur_chunk = pr->u.p.parm1;
918  break;
919  case kmp_sch_dynamic_chunked:
920  schedtype = 1;
921  break;
922  case kmp_sch_guided_iterative_chunked:
923  case kmp_sch_guided_analytical_chunked:
924  case kmp_sch_guided_simd:
925  schedtype = 2;
926  break;
927  default:
928  // Should we put this case under "static"?
929  // case kmp_sch_static_steal:
930  schedtype = 3;
931  break;
932  }
933  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
934  }
935 #if KMP_USE_HIER_SCHED
936  if (pr->flags.use_hier) {
937  pr->u.p.count = 0;
938  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
939  }
940 #endif // KMP_USE_HIER_SCHED
941 #endif /* USE_ITT_BUILD */
942  }
943 
944 #ifdef KMP_DEBUG
945  {
946  char *buff;
947  // create format specifiers before the debug output
948  buff = __kmp_str_format(
949  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
950  "lb:%%%s ub:%%%s"
951  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
952  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
953  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
954  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
955  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
956  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
957  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
958  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
959  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
960  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
961  __kmp_str_free(&buff);
962  }
963 #endif
964 #if (KMP_STATIC_STEAL_ENABLED)
965  // It cannot be guaranteed that after execution of a loop with some other
966  // schedule kind all the parm3 variables will contain the same value. Even if
967  // all parm3 values were the same, a bad case could still arise, e.g. toggling
968  // between 0 and 1 rather than a program-lifetime increment. So a dedicated
969  // variable is required; 'static_steal_counter' is used.
970  if (schedule == kmp_sch_static_steal) {
971  // Other threads will inspect this variable when searching for a victim.
972  // This is a flag showing that other threads may steal from this thread
973  // from now on.
974  volatile T *p = &pr->u.p.static_steal_counter;
975  *p = *p + 1;
976  }
977 #endif // ( KMP_STATIC_STEAL_ENABLED )
978 
979 #if OMPT_SUPPORT && OMPT_OPTIONAL
980  if (ompt_enabled.ompt_callback_work) {
981  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
982  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
983  ompt_callbacks.ompt_callback(ompt_callback_work)(
984  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
985  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
986  }
987 #endif
988  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
989 }
990 
991 /* For ordered loops, either __kmp_dispatch_finish() should be called after
992  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
993  * every chunk of iterations. If the ordered section(s) were not executed
994  * for this iteration (or every iteration in this chunk), we need to set the
995  * ordered iteration counters so that the next thread can proceed. */
996 template <typename UT>
997 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
998  typedef typename traits_t<UT>::signed_t ST;
999  kmp_info_t *th = __kmp_threads[gtid];
1000 
1001  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1002  if (!th->th.th_team->t.t_serialized) {
1003 
1004  dispatch_private_info_template<UT> *pr =
1005  reinterpret_cast<dispatch_private_info_template<UT> *>(
1006  th->th.th_dispatch->th_dispatch_pr_current);
1007  dispatch_shared_info_template<UT> volatile *sh =
1008  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1009  th->th.th_dispatch->th_dispatch_sh_current);
1010  KMP_DEBUG_ASSERT(pr);
1011  KMP_DEBUG_ASSERT(sh);
1012  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1013  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1014 
1015  if (pr->ordered_bumped) {
1016  KD_TRACE(
1017  1000,
1018  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1019  gtid));
1020  pr->ordered_bumped = 0;
1021  } else {
1022  UT lower = pr->u.p.ordered_lower;
1023 
1024 #ifdef KMP_DEBUG
1025  {
1026  char *buff;
1027  // create format specifiers before the debug output
1028  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1029  "ordered_iteration:%%%s lower:%%%s\n",
1030  traits_t<UT>::spec, traits_t<UT>::spec);
1031  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1032  __kmp_str_free(&buff);
1033  }
1034 #endif
1035 
1036  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1037  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1038  KMP_MB(); /* is this necessary? */
1039 #ifdef KMP_DEBUG
1040  {
1041  char *buff;
1042  // create format specifiers before the debug output
1043  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1044  "ordered_iteration:%%%s lower:%%%s\n",
1045  traits_t<UT>::spec, traits_t<UT>::spec);
1046  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1047  __kmp_str_free(&buff);
1048  }
1049 #endif
1050 
1051  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1052  } // if
1053  } // if
1054  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1055 }
1056 
1057 #ifdef KMP_GOMP_COMPAT
1058 
1059 template <typename UT>
1060 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1061  typedef typename traits_t<UT>::signed_t ST;
1062  kmp_info_t *th = __kmp_threads[gtid];
1063 
1064  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1065  if (!th->th.th_team->t.t_serialized) {
1066  // int cid;
1067  dispatch_private_info_template<UT> *pr =
1068  reinterpret_cast<dispatch_private_info_template<UT> *>(
1069  th->th.th_dispatch->th_dispatch_pr_current);
1070  dispatch_shared_info_template<UT> volatile *sh =
1071  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1072  th->th.th_dispatch->th_dispatch_sh_current);
1073  KMP_DEBUG_ASSERT(pr);
1074  KMP_DEBUG_ASSERT(sh);
1075  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1076  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1077 
1078  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1079  UT lower = pr->u.p.ordered_lower;
1080  UT upper = pr->u.p.ordered_upper;
1081  UT inc = upper - lower + 1;
1082 
1083  if (pr->ordered_bumped == inc) {
1084  KD_TRACE(
1085  1000,
1086  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1087  gtid));
1088  pr->ordered_bumped = 0;
1089  } else {
1090  inc -= pr->ordered_bumped;
1091 
1092 #ifdef KMP_DEBUG
1093  {
1094  char *buff;
1095  // create format specifiers before the debug output
1096  buff = __kmp_str_format(
1097  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1098  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1099  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1100  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1101  __kmp_str_free(&buff);
1102  }
1103 #endif
1104 
1105  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1106  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1107 
1108  KMP_MB(); /* is this necessary? */
1109  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1110  "ordered_bumped to zero\n",
1111  gtid));
1112  pr->ordered_bumped = 0;
1114 #ifdef KMP_DEBUG
1115  {
1116  char *buff;
1117  // create format specifiers before the debug output
1118  buff = __kmp_str_format(
1119  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1120  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1121  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1122  traits_t<UT>::spec);
1123  KD_TRACE(1000,
1124  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1125  __kmp_str_free(&buff);
1126  }
1127 #endif
1128 
1129  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1130  }
1131  // }
1132  }
1133  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1134 }
1135 
1136 #endif /* KMP_GOMP_COMPAT */
1137 
1138 template <typename T>
1139 int __kmp_dispatch_next_algorithm(int gtid,
1140  dispatch_private_info_template<T> *pr,
1141  dispatch_shared_info_template<T> volatile *sh,
1142  kmp_int32 *p_last, T *p_lb, T *p_ub,
1143  typename traits_t<T>::signed_t *p_st, T nproc,
1144  T tid) {
1145  typedef typename traits_t<T>::unsigned_t UT;
1146  typedef typename traits_t<T>::signed_t ST;
1147  typedef typename traits_t<T>::floating_t DBL;
1148  int status = 0;
1149  kmp_int32 last = 0;
1150  T start;
1151  ST incr;
1152  UT limit, trip, init;
1153  kmp_info_t *th = __kmp_threads[gtid];
1154  kmp_team_t *team = th->th.th_team;
1155 
1156  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1157  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1158  KMP_DEBUG_ASSERT(pr);
1159  KMP_DEBUG_ASSERT(sh);
1160  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1161 #ifdef KMP_DEBUG
1162  {
1163  char *buff;
1164  // create format specifiers before the debug output
1165  buff =
1166  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1167  "sh:%%p nproc:%%%s tid:%%%s\n",
1168  traits_t<T>::spec, traits_t<T>::spec);
1169  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1170  __kmp_str_free(&buff);
1171  }
1172 #endif
1173 
1174  // zero trip count
1175  if (pr->u.p.tc == 0) {
1176  KD_TRACE(10,
1177  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1178  "zero status:%d\n",
1179  gtid, status));
1180  return 0;
1181  }
1182 
1183  switch (pr->schedule) {
1184 #if (KMP_STATIC_STEAL_ENABLED)
1185  case kmp_sch_static_steal: {
1186  T chunk = pr->u.p.parm1;
1187 
1188  KD_TRACE(100,
1189  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1190  gtid));
1191 
1192  trip = pr->u.p.tc - 1;
1193 
1194  if (traits_t<T>::type_size > 4) {
1195  // use lock for 8-byte and CAS for 4-byte induction
1196  // variable. TODO (optional): check and use 16-byte CAS
1197  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1198  KMP_DEBUG_ASSERT(lck != NULL);
1199  if (pr->u.p.count < (UT)pr->u.p.ub) {
1200  __kmp_acquire_lock(lck, gtid);
1201  // try to get own chunk of iterations
1202  init = (pr->u.p.count)++;
1203  status = (init < (UT)pr->u.p.ub);
1204  __kmp_release_lock(lck, gtid);
1205  } else {
1206  status = 0; // no own chunks
1207  }
1208  if (!status) { // try to steal
1209  kmp_info_t **other_threads = team->t.t_threads;
1210  int while_limit = pr->u.p.parm3;
1211  int while_index = 0;
1212  // TODO: algorithm of searching for a victim
1213  // should be cleaned up and measured
1214  while ((!status) && (while_limit != ++while_index)) {
1215  T remaining;
1216  T victimIdx = pr->u.p.parm4;
1217  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1218  dispatch_private_info_template<T> *victim =
1219  reinterpret_cast<dispatch_private_info_template<T> *>(
1220  other_threads[victimIdx]
1221  ->th.th_dispatch->th_dispatch_pr_current);
1222  while ((victim == NULL || victim == pr ||
1223  (*(volatile T *)&victim->u.p.static_steal_counter !=
1224  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1225  oldVictimIdx != victimIdx) {
1226  victimIdx = (victimIdx + 1) % nproc;
1227  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1228  other_threads[victimIdx]
1229  ->th.th_dispatch->th_dispatch_pr_current);
1230  }
1231  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1232  *(volatile T *)&pr->u.p.static_steal_counter)) {
1233  continue; // try once more (nproc attempts in total)
1234  // no victim is ready yet to participate in stealing
1235  // because all victims are still in kmp_init_dispatch
1236  }
1237  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1238  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1239  continue; // not enough chunks to steal, goto next victim
1240  }
1241 
1242  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1243  KMP_ASSERT(lck != NULL);
1244  __kmp_acquire_lock(lck, gtid);
1245  limit = victim->u.p.ub; // keep initial ub
1246  if (victim->u.p.count >= limit ||
1247  (remaining = limit - victim->u.p.count) < 2) {
1248  __kmp_release_lock(lck, gtid);
1249  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1250  continue; // not enough chunks to steal
1251  }
1252  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1253  // by 1
1254  if (remaining > 3) {
1255  // steal 1/4 of remaining
1256  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1257  init = (victim->u.p.ub -= (remaining >> 2));
1258  } else {
1259  // steal 1 chunk of 2 or 3 remaining
1260  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1261  init = (victim->u.p.ub -= 1);
1262  }
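  // Example: victim has count=4, ub=16 (12 chunks remaining); the thief lowers
  // the victim's ub to 13 and takes chunk indices 13..15 for itself (init=13
  // is returned now, 14..15 are recorded in its own count/ub below).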
1263  __kmp_release_lock(lck, gtid);
1264 
1265  KMP_DEBUG_ASSERT(init + 1 <= limit);
1266  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1267  status = 1;
1268  while_index = 0;
1269  // now update own count and ub with the stolen range, minus the init chunk
1270  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1271  pr->u.p.count = init + 1;
1272  pr->u.p.ub = limit;
1273  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1274  } // while (search for victim)
1275  } // if (try to find victim and steal)
1276  } else {
1277  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1278  typedef union {
1279  struct {
1280  UT count;
1281  T ub;
1282  } p;
1283  kmp_int64 b;
1284  } union_i4;
1285  // All operations on 'count' or 'ub' must be combined atomically
1286  // together.
1287  {
1288  union_i4 vold, vnew;
1289  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1290  vnew = vold;
1291  vnew.p.count++;
1292  while (!KMP_COMPARE_AND_STORE_ACQ64(
1293  (volatile kmp_int64 *)&pr->u.p.count,
1294  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1295  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1296  KMP_CPU_PAUSE();
1297  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1298  vnew = vold;
1299  vnew.p.count++;
1300  }
1301  vnew = vold;
1302  init = vnew.p.count;
1303  status = (init < (UT)vnew.p.ub);
1304  }
1305 
1306  if (!status) {
1307  kmp_info_t **other_threads = team->t.t_threads;
1308  int while_limit = pr->u.p.parm3;
1309  int while_index = 0;
1310 
1311  // TODO: algorithm of searching for a victim
1312  // should be cleaned up and measured
1313  while ((!status) && (while_limit != ++while_index)) {
1314  union_i4 vold, vnew;
1315  kmp_int32 remaining;
1316  T victimIdx = pr->u.p.parm4;
1317  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1318  dispatch_private_info_template<T> *victim =
1319  reinterpret_cast<dispatch_private_info_template<T> *>(
1320  other_threads[victimIdx]
1321  ->th.th_dispatch->th_dispatch_pr_current);
1322  while ((victim == NULL || victim == pr ||
1323  (*(volatile T *)&victim->u.p.static_steal_counter !=
1324  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1325  oldVictimIdx != victimIdx) {
1326  victimIdx = (victimIdx + 1) % nproc;
1327  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1328  other_threads[victimIdx]
1329  ->th.th_dispatch->th_dispatch_pr_current);
1330  }
1331  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1332  *(volatile T *)&pr->u.p.static_steal_counter)) {
1333  continue; // try once more (nproc attempts in total)
1334  // no victim is ready yet to participate in stealing
1335  // because all victims are still in kmp_init_dispatch
1336  }
1337  pr->u.p.parm4 = victimIdx; // new victim found
1338  while (1) { // CAS loop if victim has enough chunks to steal
1339  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1340  vnew = vold;
1341 
1342  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1343  if (vnew.p.count >= (UT)vnew.p.ub ||
1344  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1345  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1346  break; // not enough chunks to steal, goto next victim
1347  }
1348  if (remaining > 3) {
1349  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1350  } else {
1351  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1352  }
1353  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1354  // TODO: Should this be acquire or release?
1355  if (KMP_COMPARE_AND_STORE_ACQ64(
1356  (volatile kmp_int64 *)&victim->u.p.count,
1357  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1358  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1359  // stealing succeeded
1360  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1361  vold.p.ub - vnew.p.ub);
1362  status = 1;
1363  while_index = 0;
1364  // now update own count and ub
1365  init = vnew.p.ub;
1366  vold.p.count = init + 1;
1367 #if KMP_ARCH_X86
1368  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1369 #else
1370  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1371 #endif
1372  break;
1373  } // if (check CAS result)
1374  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1375  } // while (try to steal from particular victim)
1376  } // while (search for victim)
1377  } // if (try to find victim and steal)
1378  } // if (4-byte induction variable)
1379  if (!status) {
1380  *p_lb = 0;
1381  *p_ub = 0;
1382  if (p_st != NULL)
1383  *p_st = 0;
1384  } else {
1385  start = pr->u.p.parm2;
1386  init *= chunk;
1387  limit = chunk + init - 1;
1388  incr = pr->u.p.st;
1389  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1390 
1391  KMP_DEBUG_ASSERT(init <= trip);
1392  if ((last = (limit >= trip)) != 0)
1393  limit = trip;
1394  if (p_st != NULL)
1395  *p_st = incr;
1396 
1397  if (incr == 1) {
1398  *p_lb = start + init;
1399  *p_ub = start + limit;
1400  } else {
1401  *p_lb = start + init * incr;
1402  *p_ub = start + limit * incr;
1403  }
1404 
1405  if (pr->flags.ordered) {
1406  pr->u.p.ordered_lower = init;
1407  pr->u.p.ordered_upper = limit;
1408  } // if
1409  } // if
1410  break;
1411  } // case
1412 #endif // ( KMP_STATIC_STEAL_ENABLED )
1413  case kmp_sch_static_balanced: {
1414  KD_TRACE(
1415  10,
1416  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1417  gtid));
1418  /* check if thread has any iteration to do */
1419  if ((status = !pr->u.p.count) != 0) {
1420  pr->u.p.count = 1;
1421  *p_lb = pr->u.p.lb;
1422  *p_ub = pr->u.p.ub;
1423  last = pr->u.p.parm1;
1424  if (p_st != NULL)
1425  *p_st = pr->u.p.st;
1426  } else { /* no iterations to do */
1427  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1428  }
1429  } // case
1430  break;
1431  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1432  merged here */
1433  case kmp_sch_static_chunked: {
1434  T parm1;
1435 
1436  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1437  "kmp_sch_static_[affinity|chunked] case\n",
1438  gtid));
1439  parm1 = pr->u.p.parm1;
1440 
1441  trip = pr->u.p.tc - 1;
1442  init = parm1 * (pr->u.p.count + tid);
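  // Chunks are handed out round-robin with stride nproc. Example: chunk=4,
  // nproc=3, tid=1, lb=0: this thread gets iterations [4,7] on the first call,
  // [16,19] on the second (count advances by nproc below), and so on.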
1443 
1444  if ((status = (init <= trip)) != 0) {
1445  start = pr->u.p.lb;
1446  incr = pr->u.p.st;
1447  limit = parm1 + init - 1;
1448 
1449  if ((last = (limit >= trip)) != 0)
1450  limit = trip;
1451 
1452  if (p_st != NULL)
1453  *p_st = incr;
1454 
1455  pr->u.p.count += nproc;
1456 
1457  if (incr == 1) {
1458  *p_lb = start + init;
1459  *p_ub = start + limit;
1460  } else {
1461  *p_lb = start + init * incr;
1462  *p_ub = start + limit * incr;
1463  }
1464 
1465  if (pr->flags.ordered) {
1466  pr->u.p.ordered_lower = init;
1467  pr->u.p.ordered_upper = limit;
1468  } // if
1469  } // if
1470  } // case
1471  break;
1472 
1473  case kmp_sch_dynamic_chunked: {
1474  T chunk = pr->u.p.parm1;
1475 
1476  KD_TRACE(
1477  100,
1478  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1479  gtid));
1480 
1481  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1482  trip = pr->u.p.tc - 1;
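  // Each call atomically claims the next chunk index from the shared
  // iteration counter. Example: tc=10, chunk=4: successive claims yield
  // offsets [0,3], [4,7], [8,9] from lb across the team.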
1483 
1484  if ((status = (init <= trip)) == 0) {
1485  *p_lb = 0;
1486  *p_ub = 0;
1487  if (p_st != NULL)
1488  *p_st = 0;
1489  } else {
1490  start = pr->u.p.lb;
1491  limit = chunk + init - 1;
1492  incr = pr->u.p.st;
1493 
1494  if ((last = (limit >= trip)) != 0)
1495  limit = trip;
1496 
1497  if (p_st != NULL)
1498  *p_st = incr;
1499 
1500  if (incr == 1) {
1501  *p_lb = start + init;
1502  *p_ub = start + limit;
1503  } else {
1504  *p_lb = start + init * incr;
1505  *p_ub = start + limit * incr;
1506  }
1507 
1508  if (pr->flags.ordered) {
1509  pr->u.p.ordered_lower = init;
1510  pr->u.p.ordered_upper = limit;
1511  } // if
1512  } // if
1513  } // case
1514  break;
1515 
1516  case kmp_sch_guided_iterative_chunked: {
1517  T chunkspec = pr->u.p.parm1;
1518  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1519  "iterative case\n",
1520  gtid));
1521  trip = pr->u.p.tc;
1522  // Start atomic part of calculations
1523  while (1) {
1524  ST remaining; // signed, because can be < 0
1525  init = sh->u.s.iteration; // shared value
1526  remaining = trip - init;
1527  if (remaining <= 0) { // AC: need to compare with 0 first
1528  // nothing to do, don't try atomic op
1529  status = 0;
1530  break;
1531  }
1532  if ((T)remaining <
1533  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1534  // use dynamic-style schedule
1535  // atomically increment iterations, get old value
1536  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1537  (ST)chunkspec);
1538  remaining = trip - init;
1539  if (remaining <= 0) {
1540  status = 0; // all iterations got by other threads
1541  } else {
1542  // got some iterations to work on
1543  status = 1;
1544  if ((T)remaining > chunkspec) {
1545  limit = init + chunkspec - 1;
1546  } else {
1547  last = 1; // the last chunk
1548  limit = init + remaining - 1;
1549  } // if
1550  } // if
1551  break;
1552  } // if
1553  limit = init +
1554  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1555  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1556  (ST)init, (ST)limit)) {
1557  // CAS was successful, chunk obtained
1558  status = 1;
1559  --limit;
1560  break;
1561  } // if
1562  } // while
1563  if (status != 0) {
1564  start = pr->u.p.lb;
1565  incr = pr->u.p.st;
1566  if (p_st != NULL)
1567  *p_st = incr;
1568  *p_lb = start + init * incr;
1569  *p_ub = start + limit * incr;
1570  if (pr->flags.ordered) {
1571  pr->u.p.ordered_lower = init;
1572  pr->u.p.ordered_upper = limit;
1573  } // if
1574  } else {
1575  *p_lb = 0;
1576  *p_ub = 0;
1577  if (p_st != NULL)
1578  *p_st = 0;
1579  } // if
1580  } // case
1581  break;
1582 
1583  case kmp_sch_guided_simd: {
1584  // same as iterative but curr-chunk adjusted to be multiple of given
1585  // chunk
1586  T chunk = pr->u.p.parm1;
1587  KD_TRACE(100,
1588  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1589  gtid));
1590  trip = pr->u.p.tc;
1591  // Start atomic part of calculations
1592  while (1) {
1593  ST remaining; // signed, because can be < 0
1594  init = sh->u.s.iteration; // shared value
1595  remaining = trip - init;
1596  if (remaining <= 0) { // AC: need to compare with 0 first
1597  status = 0; // nothing to do, don't try atomic op
1598  break;
1599  }
1600  KMP_DEBUG_ASSERT(init % chunk == 0);
1601  // compare with K*nproc*(chunk+1), K=2 by default
1602  if ((T)remaining < pr->u.p.parm2) {
1603  // use dynamic-style schedule
1604  // atomically increment iterations, get old value
1605  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1606  (ST)chunk);
1607  remaining = trip - init;
1608  if (remaining <= 0) {
1609  status = 0; // all iterations got by other threads
1610  } else {
1611  // got some iterations to work on
1612  status = 1;
1613  if ((T)remaining > chunk) {
1614  limit = init + chunk - 1;
1615  } else {
1616  last = 1; // the last chunk
1617  limit = init + remaining - 1;
1618  } // if
1619  } // if
1620  break;
1621  } // if
1622  // divide by K*nproc
1623  UT span = remaining * (*(double *)&pr->u.p.parm3);
1624  UT rem = span % chunk;
1625  if (rem) // adjust so that span%chunk == 0
1626  span += chunk - rem;
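  // Example: remaining=100, nproc=4 (parm3 ~ 0.125), chunk=8: span=12 is
  // rounded up to 16 so every guided chunk stays a multiple of the simd width.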
1627  limit = init + span;
1628  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1629  (ST)init, (ST)limit)) {
1630  // CAS was successful, chunk obtained
1631  status = 1;
1632  --limit;
1633  break;
1634  } // if
1635  } // while
1636  if (status != 0) {
1637  start = pr->u.p.lb;
1638  incr = pr->u.p.st;
1639  if (p_st != NULL)
1640  *p_st = incr;
1641  *p_lb = start + init * incr;
1642  *p_ub = start + limit * incr;
1643  if (pr->flags.ordered) {
1644  pr->u.p.ordered_lower = init;
1645  pr->u.p.ordered_upper = limit;
1646  } // if
1647  } else {
1648  *p_lb = 0;
1649  *p_ub = 0;
1650  if (p_st != NULL)
1651  *p_st = 0;
1652  } // if
1653  } // case
1654  break;
1655 
1656  case kmp_sch_guided_analytical_chunked: {
1657  T chunkspec = pr->u.p.parm1;
1658  UT chunkIdx;
1659 #if KMP_USE_X87CONTROL
1660  /* for storing original FPCW value for Windows* OS on
1661  IA-32 architecture 8-byte version */
1662  unsigned int oldFpcw;
1663  unsigned int fpcwSet = 0;
1664 #endif
1665  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1666  "kmp_sch_guided_analytical_chunked case\n",
1667  gtid));
1668 
1669  trip = pr->u.p.tc;
1670 
1671  KMP_DEBUG_ASSERT(nproc > 1);
1672  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1673 
1674  while (1) { /* this while loop is a safeguard against unexpected zero
1675  chunk sizes */
1676  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1677  if (chunkIdx >= (UT)pr->u.p.parm2) {
1678  --trip;
1679  /* use dynamic-style scheduling */
1680  init = chunkIdx * chunkspec + pr->u.p.count;
1681  /* need to verify init > 0 in case of overflow in the above
1682  * calculation */
1683  if ((status = (init > 0 && init <= trip)) != 0) {
1684  limit = init + chunkspec - 1;
1685 
1686  if ((last = (limit >= trip)) != 0)
1687  limit = trip;
1688  }
1689  break;
1690  } else {
1691 /* use exponential-style scheduling */
1692 /* The following check is to work around the lack of long double precision on
1693  Windows* OS.
1694  This check works around the possible effect that init != 0 for chunkIdx == 0.
1695  */
1696 #if KMP_USE_X87CONTROL
1697  /* If we haven't already done so, save original
1698  FPCW and set precision to 64-bit, as Windows* OS
1699  on IA-32 architecture defaults to 53-bit */
1700  if (!fpcwSet) {
1701  oldFpcw = _control87(0, 0);
1702  _control87(_PC_64, _MCW_PC);
1703  fpcwSet = 0x30000;
1704  }
1705 #endif
1706  if (chunkIdx) {
1707  init = __kmp_dispatch_guided_remaining<T>(
1708  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1709  KMP_DEBUG_ASSERT(init);
1710  init = trip - init;
1711  } else
1712  init = 0;
1713  limit = trip - __kmp_dispatch_guided_remaining<T>(
1714  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1715  KMP_ASSERT(init <= limit);
1716  if (init < limit) {
1717  KMP_DEBUG_ASSERT(limit <= trip);
1718  --limit;
1719  status = 1;
1720  break;
1721  } // if
1722  } // if
1723  } // while (1)
1724 #if KMP_USE_X87CONTROL
1725  /* restore FPCW if necessary
1726  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1727  */
1728  if (fpcwSet && (oldFpcw & fpcwSet))
1729  _control87(oldFpcw, _MCW_PC);
1730 #endif
1731  if (status != 0) {
1732  start = pr->u.p.lb;
1733  incr = pr->u.p.st;
1734  if (p_st != NULL)
1735  *p_st = incr;
1736  *p_lb = start + init * incr;
1737  *p_ub = start + limit * incr;
1738  if (pr->flags.ordered) {
1739  pr->u.p.ordered_lower = init;
1740  pr->u.p.ordered_upper = limit;
1741  }
1742  } else {
1743  *p_lb = 0;
1744  *p_ub = 0;
1745  if (p_st != NULL)
1746  *p_st = 0;
1747  }
1748  } // case
1749  break;
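  // --- Editor's sketch (illustrative, not part of the runtime) ---
  // The KMP_USE_X87CONTROL code above follows the usual x87 save/adjust/
  // restore pattern; in isolation (assuming <float.h>'s _control87 on
  // Windows* OS for IA-32) it looks like:
  //
  //   unsigned int saved = _control87(0, 0);  // read the current control word
  //   _control87(_PC_64, _MCW_PC);            // widen precision to 64-bit
  //   /* ... extended-precision computations ... */
  //   _control87(saved, _MCW_PC);             // restore the caller's setting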
1750 
1751  case kmp_sch_trapezoidal: {
1752  UT index;
1753  T parm2 = pr->u.p.parm2;
1754  T parm3 = pr->u.p.parm3;
1755  T parm4 = pr->u.p.parm4;
1756  KD_TRACE(100,
1757  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1758  gtid));
1759 
1760  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1761 
1762  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1763  trip = pr->u.p.tc - 1;
1764 
1765  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1766  *p_lb = 0;
1767  *p_ub = 0;
1768  if (p_st != NULL)
1769  *p_st = 0;
1770  } else {
1771  start = pr->u.p.lb;
1772  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1773  incr = pr->u.p.st;
1774 
1775  if ((last = (limit >= trip)) != 0)
1776  limit = trip;
1777 
1778  if (p_st != NULL)
1779  *p_st = incr;
1780 
1781  if (incr == 1) {
1782  *p_lb = start + init;
1783  *p_ub = start + limit;
1784  } else {
1785  *p_lb = start + init * incr;
1786  *p_ub = start + limit * incr;
1787  }
1788 
1789  if (pr->flags.ordered) {
1790  pr->u.p.ordered_lower = init;
1791  pr->u.p.ordered_upper = limit;
1792  } // if
1793  } // if
1794  } // case
1795  break;
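  // --- Editor's sketch (illustrative, not part of the runtime) ---
  // Trapezoid self-scheduling hands out chunks whose sizes decrease linearly:
  // parm2 is the first (largest) chunk size, parm4 the per-chunk decrement and
  // parm3 the number of chunks (all computed in __kmp_dispatch_init_algorithm).
  // The init/limit formulas above are partial sums of that sequence. For
  // example, with parm2 = 10 and parm4 = 2 the chunk sizes are 10, 8, 6, ...:
  //
  //   index = 0:  init = 0,                       limit = (1*20)/2 - 1 = 9
  //   index = 1:  init = (1*(20 - 0))/2   = 10,   limit = (2*18)/2 - 1 = 17
  //   index = 2:  init = (2*(20 - 1*2))/2 = 18,   limit = (3*16)/2 - 1 = 23
  //
  // i.e. iteration ranges [0,9], [10,17], [18,23] of sizes 10, 8 and 6.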
1796  default: {
1797  status = 0; // to avoid complaints on uninitialized variable use
1798  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1799  KMP_HNT(GetNewerLibrary), // Hint
1800  __kmp_msg_null // Variadic argument list terminator
1801  );
1802  } break;
1803  } // switch
1804  if (p_last)
1805  *p_last = last;
1806 #ifdef KMP_DEBUG
1807  if (pr->flags.ordered) {
1808  char *buff;
1809  // create format specifiers before the debug output
1810  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1811  "ordered_lower:%%%s ordered_upper:%%%s\n",
1812  traits_t<UT>::spec, traits_t<UT>::spec);
1813  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1814  __kmp_str_free(&buff);
1815  }
1816  {
1817  char *buff;
1818  // create format specifiers before the debug output
1819  buff = __kmp_str_format(
1820  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1821  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1822  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1823  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1824  __kmp_str_free(&buff);
1825  }
1826 #endif
1827  return status;
1828 }
1829 
1830 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1831  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1832  is not called. */
1833 #if OMPT_SUPPORT && OMPT_OPTIONAL
1834 #define OMPT_LOOP_END \
1835  if (status == 0) { \
1836  if (ompt_enabled.ompt_callback_work) { \
1837  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1838  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1839  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1840  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1841  &(task_info->task_data), 0, codeptr); \
1842  } \
1843  }
1844 // TODO: implement count
1845 #else
1846 #define OMPT_LOOP_END // no-op
1847 #endif
1848 
1849 #if KMP_STATS_ENABLED
1850 #define KMP_STATS_LOOP_END \
1851  { \
1852  kmp_int64 u, l, t, i; \
1853  l = (kmp_int64)(*p_lb); \
1854  u = (kmp_int64)(*p_ub); \
1855  i = (kmp_int64)(pr->u.p.st); \
1856  if (status == 0) { \
1857  t = 0; \
1858  KMP_POP_PARTITIONED_TIMER(); \
1859  } else if (i == 1) { \
1860  if (u >= l) \
1861  t = u - l + 1; \
1862  else \
1863  t = 0; \
1864  } else if (i < 0) { \
1865  if (l >= u) \
1866  t = (l - u) / (-i) + 1; \
1867  else \
1868  t = 0; \
1869  } else { \
1870  if (u >= l) \
1871  t = (u - l) / i + 1; \
1872  else \
1873  t = 0; \
1874  } \
1875  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1876  }
1877 #else
1878 #define KMP_STATS_LOOP_END /* Nothing */
1879 #endif
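
// --- Editor's sketch (illustrative, not part of the runtime) ---
// The iteration count that KMP_STATS_LOOP_END derives from the chunk bounds
// and stride, written as a standalone helper. The macro above operates on
// *p_lb, *p_ub and pr->u.p.st directly; this helper name is hypothetical.
static inline kmp_int64 __kmp_example_chunk_trip_count(kmp_int64 l,
                                                       kmp_int64 u,
                                                       kmp_int64 i) {
  if (i == 1) // unit stride: simple difference
    return (u >= l) ? u - l + 1 : 0;
  if (i < 0) // negative stride: count downwards
    return (l >= u) ? (l - u) / (-i) + 1 : 0;
  return (u >= l) ? (u - l) / i + 1 : 0; // positive, non-unit stride
}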
1880 
1881 template <typename T>
1882 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1883  T *p_lb, T *p_ub,
1884  typename traits_t<T>::signed_t *p_st
1885 #if OMPT_SUPPORT && OMPT_OPTIONAL
1886  ,
1887  void *codeptr
1888 #endif
1889  ) {
1890 
1891  typedef typename traits_t<T>::unsigned_t UT;
1892  typedef typename traits_t<T>::signed_t ST;
1893  // This is potentially slightly misleading: schedule(runtime) will appear here
1894  // even if the actual runtime schedule is static. (Which points out a
1895  // disadvantage of schedule(runtime): even when static scheduling is used it
1896  // costs more than a compile-time choice to use static scheduling would.)
1897  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1898 
1899  int status;
1900  dispatch_private_info_template<T> *pr;
1901  kmp_info_t *th = __kmp_threads[gtid];
1902  kmp_team_t *team = th->th.th_team;
1903 
1904  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1905  KD_TRACE(
1906  1000,
1907  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1908  gtid, p_lb, p_ub, p_st, p_last));
1909 
1910  if (team->t.t_serialized) {
1911  /* NOTE: serialize this dispatch because we are not at the active level */
1912  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1913  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1914  KMP_DEBUG_ASSERT(pr);
1915 
1916  if ((status = (pr->u.p.tc != 0)) == 0) {
1917  *p_lb = 0;
1918  *p_ub = 0;
1919  // if ( p_last != NULL )
1920  // *p_last = 0;
1921  if (p_st != NULL)
1922  *p_st = 0;
1923  if (__kmp_env_consistency_check) {
1924  if (pr->pushed_ws != ct_none) {
1925  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1926  }
1927  }
1928  } else if (pr->flags.nomerge) {
1929  kmp_int32 last;
1930  T start;
1931  UT limit, trip, init;
1932  ST incr;
1933  T chunk = pr->u.p.parm1;
1934 
1935  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1936  gtid));
1937 
1938  init = chunk * pr->u.p.count++;
1939  trip = pr->u.p.tc - 1;
1940 
1941  if ((status = (init <= trip)) == 0) {
1942  *p_lb = 0;
1943  *p_ub = 0;
1944  // if ( p_last != NULL )
1945  // *p_last = 0;
1946  if (p_st != NULL)
1947  *p_st = 0;
1948  if (__kmp_env_consistency_check) {
1949  if (pr->pushed_ws != ct_none) {
1950  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1951  }
1952  }
1953  } else {
1954  start = pr->u.p.lb;
1955  limit = chunk + init - 1;
1956  incr = pr->u.p.st;
1957 
1958  if ((last = (limit >= trip)) != 0) {
1959  limit = trip;
1960 #if KMP_OS_WINDOWS
1961  pr->u.p.last_upper = pr->u.p.ub;
1962 #endif /* KMP_OS_WINDOWS */
1963  }
1964  if (p_last != NULL)
1965  *p_last = last;
1966  if (p_st != NULL)
1967  *p_st = incr;
1968  if (incr == 1) {
1969  *p_lb = start + init;
1970  *p_ub = start + limit;
1971  } else {
1972  *p_lb = start + init * incr;
1973  *p_ub = start + limit * incr;
1974  }
1975 
1976  if (pr->flags.ordered) {
1977  pr->u.p.ordered_lower = init;
1978  pr->u.p.ordered_upper = limit;
1979 #ifdef KMP_DEBUG
1980  {
1981  char *buff;
1982  // create format specifiers before the debug output
1983  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1984  "ordered_lower:%%%s ordered_upper:%%%s\n",
1985  traits_t<UT>::spec, traits_t<UT>::spec);
1986  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1987  pr->u.p.ordered_upper));
1988  __kmp_str_free(&buff);
1989  }
1990 #endif
1991  } // if
1992  } // if
1993  } else {
1994  pr->u.p.tc = 0;
1995  *p_lb = pr->u.p.lb;
1996  *p_ub = pr->u.p.ub;
1997 #if KMP_OS_WINDOWS
1998  pr->u.p.last_upper = *p_ub;
1999 #endif /* KMP_OS_WINDOWS */
2000  if (p_last != NULL)
2001  *p_last = TRUE;
2002  if (p_st != NULL)
2003  *p_st = pr->u.p.st;
2004  } // if
2005 #ifdef KMP_DEBUG
2006  {
2007  char *buff;
2008  // create format specifiers before the debug output
2009  buff = __kmp_str_format(
2010  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2011  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2012  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2013  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2014  __kmp_str_free(&buff);
2015  }
2016 #endif
2017 #if INCLUDE_SSC_MARKS
2018  SSC_MARK_DISPATCH_NEXT();
2019 #endif
2020  OMPT_LOOP_END;
2021  KMP_STATS_LOOP_END;
2022  return status;
2023  } else {
2024  kmp_int32 last = 0;
2025  dispatch_shared_info_template<T> volatile *sh;
2026 
2027  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2028  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2029 
2030  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2031  th->th.th_dispatch->th_dispatch_pr_current);
2032  KMP_DEBUG_ASSERT(pr);
2033  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2034  th->th.th_dispatch->th_dispatch_sh_current);
2035  KMP_DEBUG_ASSERT(sh);
2036 
2037 #if KMP_USE_HIER_SCHED
2038  if (pr->flags.use_hier)
2039  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2040  else
2041 #endif // KMP_USE_HIER_SCHED
2042  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2043  p_st, th->th.th_team_nproc,
2044  th->th.th_info.ds.ds_tid);
2045  // status == 0: no more iterations to execute
2046  if (status == 0) {
2047  UT num_done;
2048 
2049  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2050 #ifdef KMP_DEBUG
2051  {
2052  char *buff;
2053  // create format specifiers before the debug output
2054  buff = __kmp_str_format(
2055  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2056  traits_t<UT>::spec);
2057  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2058  __kmp_str_free(&buff);
2059  }
2060 #endif
2061 
2062 #if KMP_USE_HIER_SCHED
2063  pr->flags.use_hier = FALSE;
2064 #endif
2065  if ((ST)num_done == th->th.th_team_nproc - 1) {
2066 #if (KMP_STATIC_STEAL_ENABLED)
2067  if (pr->schedule == kmp_sch_static_steal &&
2068  traits_t<T>::type_size > 4) {
2069  int i;
2070  kmp_info_t **other_threads = team->t.t_threads;
2071  // loop complete, safe to destroy locks used for stealing
2072  for (i = 0; i < th->th.th_team_nproc; ++i) {
2073  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2074  KMP_ASSERT(lck != NULL);
2075  __kmp_destroy_lock(lck);
2076  __kmp_free(lck);
2077  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2078  }
2079  }
2080 #endif
2081  /* NOTE: release this buffer to be reused */
2082 
2083  KMP_MB(); /* Flush all pending memory write invalidates. */
2084 
2085  sh->u.s.num_done = 0;
2086  sh->u.s.iteration = 0;
2087 
2088  /* TODO replace with general release procedure? */
2089  if (pr->flags.ordered) {
2090  sh->u.s.ordered_iteration = 0;
2091  }
2092 
2093  KMP_MB(); /* Flush all pending memory write invalidates. */
2094 
2095  sh->buffer_index += __kmp_dispatch_num_buffers;
2096  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2097  gtid, sh->buffer_index));
2098 
2099  KMP_MB(); /* Flush all pending memory write invalidates. */
2100 
2101  } // if
2102  if (__kmp_env_consistency_check) {
2103  if (pr->pushed_ws != ct_none) {
2104  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2105  }
2106  }
2107 
2108  th->th.th_dispatch->th_deo_fcn = NULL;
2109  th->th.th_dispatch->th_dxo_fcn = NULL;
2110  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2111  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2112  } // if (status == 0)
2113 #if KMP_OS_WINDOWS
2114  else if (last) {
2115  pr->u.p.last_upper = pr->u.p.ub;
2116  }
2117 #endif /* KMP_OS_WINDOWS */
2118  if (p_last != NULL && status != 0)
2119  *p_last = last;
2120  } // if
2121 
2122 #ifdef KMP_DEBUG
2123  {
2124  char *buff;
2125  // create format specifiers before the debug output
2126  buff = __kmp_str_format(
2127  "__kmp_dispatch_next: T#%%d normal case: "
2128  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2129  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2130  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2131  (p_last ? *p_last : 0), status));
2132  __kmp_str_free(&buff);
2133  }
2134 #endif
2135 #if INCLUDE_SSC_MARKS
2136  SSC_MARK_DISPATCH_NEXT();
2137 #endif
2138  OMPT_LOOP_END;
2139  KMP_STATS_LOOP_END;
2140  return status;
2141 }
2142 
2143 template <typename T>
2144 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2145  kmp_int32 *plastiter, T *plower, T *pupper,
2146  typename traits_t<T>::signed_t incr) {
2147  typedef typename traits_t<T>::unsigned_t UT;
2148  kmp_uint32 team_id;
2149  kmp_uint32 nteams;
2150  UT trip_count;
2151  kmp_team_t *team;
2152  kmp_info_t *th;
2153 
2154  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2155  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2156 #ifdef KMP_DEBUG
2157  typedef typename traits_t<T>::signed_t ST;
2158  {
2159  char *buff;
2160  // create format specifiers before the debug output
2161  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2162  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2163  traits_t<T>::spec, traits_t<T>::spec,
2164  traits_t<ST>::spec, traits_t<T>::spec);
2165  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2166  __kmp_str_free(&buff);
2167  }
2168 #endif
2169 
2170  if (__kmp_env_consistency_check) {
2171  if (incr == 0) {
2172  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2173  loc);
2174  }
2175  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2176  // The loop is illegal.
2177  // Some zero-trip loops are maintained by the compiler, e.g.:
2178  // for(i=10;i<0;++i) // lower >= upper - run-time check
2179  // for(i=0;i>10;--i) // lower <= upper - run-time check
2180  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2181  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2182  // Compiler does not check the following illegal loops:
2183  // for(i=0;i<10;i+=incr) // where incr<0
2184  // for(i=10;i>0;i-=incr) // where incr<0
2185  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2186  }
2187  }
2188  th = __kmp_threads[gtid];
2189  team = th->th.th_team;
2190  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2191  nteams = th->th.th_teams_size.nteams;
2192  team_id = team->t.t_master_tid;
2193  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2194 
2195  // compute global trip count
2196  if (incr == 1) {
2197  trip_count = *pupper - *plower + 1;
2198  } else if (incr == -1) {
2199  trip_count = *plower - *pupper + 1;
2200  } else if (incr > 0) {
2201  // upper-lower can exceed the limit of signed type
2202  trip_count = (UT)(*pupper - *plower) / incr + 1;
2203  } else {
2204  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2205  }
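  // --- Editor's note (illustrative) ---
  // Example of the trip-count arithmetic above: for lower = 0, upper = 9 and
  // incr = 3 the loop touches 0, 3, 6, 9, so trip_count = (9 - 0) / 3 + 1 = 4.
  // The difference is taken in the unsigned type because upper - lower may
  // exceed the signed range.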
2206 
2207  if (trip_count <= nteams) {
2208  KMP_DEBUG_ASSERT(
2209  __kmp_static == kmp_sch_static_greedy ||
2210  __kmp_static ==
2211  kmp_sch_static_balanced); // Unknown static scheduling type.
2212  // only some teams get single iteration, others get nothing
2213  if (team_id < trip_count) {
2214  *pupper = *plower = *plower + team_id * incr;
2215  } else {
2216  *plower = *pupper + incr; // zero-trip loop
2217  }
2218  if (plastiter != NULL)
2219  *plastiter = (team_id == trip_count - 1);
2220  } else {
2221  if (__kmp_static == kmp_sch_static_balanced) {
2222  UT chunk = trip_count / nteams;
2223  UT extras = trip_count % nteams;
2224  *plower +=
2225  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2226  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2227  if (plastiter != NULL)
2228  *plastiter = (team_id == nteams - 1);
2229  } else {
2230  T chunk_inc_count =
2231  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2232  T upper = *pupper;
2233  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2234  // Unknown static scheduling type.
2235  *plower += team_id * chunk_inc_count;
2236  *pupper = *plower + chunk_inc_count - incr;
2237  // Check/correct bounds if needed
2238  if (incr > 0) {
2239  if (*pupper < *plower)
2240  *pupper = traits_t<T>::max_value;
2241  if (plastiter != NULL)
2242  *plastiter = *plower <= upper && *pupper > upper - incr;
2243  if (*pupper > upper)
2244  *pupper = upper; // tracker C73258
2245  } else {
2246  if (*pupper > *plower)
2247  *pupper = traits_t<T>::min_value;
2248  if (plastiter != NULL)
2249  *plastiter = *plower >= upper && *pupper < upper - incr;
2250  if (*pupper < upper)
2251  *pupper = upper; // tracker C73258
2252  }
2253  }
2254  }
2255 }
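
// --- Editor's sketch (illustrative, not part of the runtime) ---
// The kmp_sch_static_balanced split performed above, specialized to a unit
// stride and written as a standalone helper (the name is hypothetical). The
// first trip_count % nteams teams receive one extra iteration; e.g. with
// trip_count = 10 and nteams = 4 the per-team ranges are [0,2], [3,5],
// [6,7] and [8,9].
static inline void __kmp_example_balanced_bounds(kmp_uint32 team_id,
                                                 kmp_uint32 nteams,
                                                 kmp_uint64 trip_count,
                                                 kmp_uint64 *lower,
                                                 kmp_uint64 *upper) {
  kmp_uint64 chunk = trip_count / nteams;  // base iterations per team
  kmp_uint64 extras = trip_count % nteams; // leftover iterations
  *lower = team_id * chunk + (team_id < extras ? team_id : extras);
  *upper = *lower + chunk - (team_id < extras ? 0 : 1);
}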
2256 
2257 //-----------------------------------------------------------------------------
2258 // Dispatch routines
2259 // Transfer call to template< type T >
2260 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2261 // T lb, T ub, ST st, ST chunk )
2262 extern "C" {
2263 
2280 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2281  enum sched_type schedule, kmp_int32 lb,
2282  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2283  KMP_DEBUG_ASSERT(__kmp_init_serial);
2284 #if OMPT_SUPPORT && OMPT_OPTIONAL
2285  OMPT_STORE_RETURN_ADDRESS(gtid);
2286 #endif
2287  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2288 }
2292 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2293  enum sched_type schedule, kmp_uint32 lb,
2294  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2295  KMP_DEBUG_ASSERT(__kmp_init_serial);
2296 #if OMPT_SUPPORT && OMPT_OPTIONAL
2297  OMPT_STORE_RETURN_ADDRESS(gtid);
2298 #endif
2299  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2300 }
2301 
2305 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2306  enum sched_type schedule, kmp_int64 lb,
2307  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2308  KMP_DEBUG_ASSERT(__kmp_init_serial);
2309 #if OMPT_SUPPORT && OMPT_OPTIONAL
2310  OMPT_STORE_RETURN_ADDRESS(gtid);
2311 #endif
2312  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2313 }
2314 
2318 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2319  enum sched_type schedule, kmp_uint64 lb,
2320  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2321  KMP_DEBUG_ASSERT(__kmp_init_serial);
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323  OMPT_STORE_RETURN_ADDRESS(gtid);
2324 #endif
2325  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2326 }
2327 
2337 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2338  enum sched_type schedule, kmp_int32 *p_last,
2339  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2340  kmp_int32 chunk) {
2341  KMP_DEBUG_ASSERT(__kmp_init_serial);
2342 #if OMPT_SUPPORT && OMPT_OPTIONAL
2343  OMPT_STORE_RETURN_ADDRESS(gtid);
2344 #endif
2345  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2346  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2347 }
2348 
2349 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2350  enum sched_type schedule, kmp_int32 *p_last,
2351  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2352  kmp_int32 chunk) {
2353  KMP_DEBUG_ASSERT(__kmp_init_serial);
2354 #if OMPT_SUPPORT && OMPT_OPTIONAL
2355  OMPT_STORE_RETURN_ADDRESS(gtid);
2356 #endif
2357  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2358  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2359 }
2360 
2361 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2362  enum sched_type schedule, kmp_int32 *p_last,
2363  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2364  kmp_int64 chunk) {
2365  KMP_DEBUG_ASSERT(__kmp_init_serial);
2366 #if OMPT_SUPPORT && OMPT_OPTIONAL
2367  OMPT_STORE_RETURN_ADDRESS(gtid);
2368 #endif
2369  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2370  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2371 }
2372 
2373 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2374  enum sched_type schedule, kmp_int32 *p_last,
2375  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2376  kmp_int64 chunk) {
2377  KMP_DEBUG_ASSERT(__kmp_init_serial);
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379  OMPT_STORE_RETURN_ADDRESS(gtid);
2380 #endif
2381  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2382  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2383 }
2384 
2398 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2399  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2400 #if OMPT_SUPPORT && OMPT_OPTIONAL
2401  OMPT_STORE_RETURN_ADDRESS(gtid);
2402 #endif
2403  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405  ,
2406  OMPT_LOAD_RETURN_ADDRESS(gtid)
2407 #endif
2408  );
2409 }
2410 
2414 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2415  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2416  kmp_int32 *p_st) {
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418  OMPT_STORE_RETURN_ADDRESS(gtid);
2419 #endif
2420  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422  ,
2423  OMPT_LOAD_RETURN_ADDRESS(gtid)
2424 #endif
2425  );
2426 }
2427 
2431 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2432  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2433 #if OMPT_SUPPORT && OMPT_OPTIONAL
2434  OMPT_STORE_RETURN_ADDRESS(gtid);
2435 #endif
2436  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438  ,
2439  OMPT_LOAD_RETURN_ADDRESS(gtid)
2440 #endif
2441  );
2442 }
2443 
2447 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2448  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2449  kmp_int64 *p_st) {
2450 #if OMPT_SUPPORT && OMPT_OPTIONAL
2451  OMPT_STORE_RETURN_ADDRESS(gtid);
2452 #endif
2453  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2454 #if OMPT_SUPPORT && OMPT_OPTIONAL
2455  ,
2456  OMPT_LOAD_RETURN_ADDRESS(gtid)
2457 #endif
2458  );
2459 }
2460 
2467 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2468  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2469 }
2470 
2474 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2475  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2476 }
2477 
2481 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2482  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2483 }
2484 
2488 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2489  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2490 }
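
// --- Editor's sketch (illustrative, not part of the runtime) ---
// How compiler-generated code typically drives the dispatch entry points for
// a dynamically scheduled loop such as
//   #pragma omp for schedule(dynamic, 4)
// over 0..999. The loop-body callback and this helper are hypothetical; only
// the __kmpc_* calls and kmp_sch_dynamic_chunked come from this file / kmp.h.
static void __kmp_example_dynamic_loop(ident_t *loc, kmp_int32 gtid,
                                       void (*body)(kmp_int32)) {
  kmp_int32 lb, ub, st, last;
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                         /*lb=*/0, /*ub=*/999, /*st=*/1, /*chunk=*/4);
  // Each successful dispatch_next yields one chunk [lb, ub] with stride st;
  // a return value of 0 means the loop is exhausted and already finalized.
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i);
  }
  // (Ordered loops additionally involve the __kmpc_dispatch_fini_* entry
  // points after each chunk; omitted here.)
}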
2493 //-----------------------------------------------------------------------------
2494 // Non-template routines from kmp_dispatch.cpp used in other sources
2495 
2496 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2497  return value == checker;
2498 }
2499 
2500 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2501  return value != checker;
2502 }
2503 
2504 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2505  return value < checker;
2506 }
2507 
2508 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2509  return value >= checker;
2510 }
2511 
2512 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2513  return value <= checker;
2514 }
2515 
2516 kmp_uint32
2517 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2518  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2519  void *obj // Higher-level synchronization object, or NULL.
2520  ) {
2521  // note: we may not belong to a team at this point
2522  volatile kmp_uint32 *spin = spinner;
2523  kmp_uint32 check = checker;
2524  kmp_uint32 spins;
2525  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2526  kmp_uint32 r;
2527 
2528  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2529  KMP_INIT_YIELD(spins);
2530  // main wait spin loop
2531  while (!f(r = TCR_4(*spin), check)) {
2532  KMP_FSYNC_SPIN_PREPARE(obj);
2533  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2534  split. It causes problems with infinite recursion because of exit lock */
2535  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2536  __kmp_abort_thread(); */
2537  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2538  }
2539  KMP_FSYNC_SPIN_ACQUIRED(obj);
2540  return r;
2541 }
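
// --- Editor's sketch (illustrative, not part of the runtime) ---
// Typical use of __kmp_wait_4 with one of the predicates defined above:
// spin until a shared 32-bit flag reaches (at least) an expected value.
// The helper name and flag are hypothetical.
static inline kmp_uint32 __kmp_example_wait_for_flag(volatile kmp_uint32 *flag,
                                                     kmp_uint32 expected) {
  // Spins (with yielding under oversubscription) until __kmp_ge_4(*flag,
  // expected) holds, then returns the observed value of *flag.
  return __kmp_wait_4(flag, expected, __kmp_ge_4, NULL);
}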
2542 
2543 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2544  kmp_uint32 (*pred)(void *, kmp_uint32),
2545  void *obj // Higher-level synchronization object, or NULL.
2546  ) {
2547  // note: we may not belong to a team at this point
2548  void *spin = spinner;
2549  kmp_uint32 check = checker;
2550  kmp_uint32 spins;
2551  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2552 
2553  KMP_FSYNC_SPIN_INIT(obj, spin);
2554  KMP_INIT_YIELD(spins);
2555  // main wait spin loop
2556  while (!f(spin, check)) {
2557  KMP_FSYNC_SPIN_PREPARE(obj);
2558  /* if we have waited a bit, or are oversubscribed, yield */
2559  /* pause is in the following code */
2560  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2561  }
2562  KMP_FSYNC_SPIN_ACQUIRED(obj);
2563 }
2564 
2565 } // extern "C"
2566 
2567 #ifdef KMP_GOMP_COMPAT
2568 
2569 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2570  enum sched_type schedule, kmp_int32 lb,
2571  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2572  int push_ws) {
2573  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2574  push_ws);
2575 }
2576 
2577 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2578  enum sched_type schedule, kmp_uint32 lb,
2579  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2580  int push_ws) {
2581  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2582  push_ws);
2583 }
2584 
2585 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2586  enum sched_type schedule, kmp_int64 lb,
2587  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2588  int push_ws) {
2589  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2590  push_ws);
2591 }
2592 
2593 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2594  enum sched_type schedule, kmp_uint64 lb,
2595  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2596  int push_ws) {
2597  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2598  push_ws);
2599 }
2600 
2601 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2602  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2603 }
2604 
2605 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2606  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2607 }
2608 
2609 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2610  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2611 }
2612 
2613 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2614  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2615 }
2616 
2617 #endif /* KMP_GOMP_COMPAT */
2618 
2619 /* ------------------------------------------------------------------------ */